[llvm] [RISCV] Remove codegen for vp_ctlz, vp_cttz, vp_ctpop (PR #189904)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 1 00:58:53 PDT 2026


https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/189904

Part of the work to remove trivial VP intrinsics from the RISC-V backend, see https://discourse.llvm.org/t/rfc-remove-codegen-support-for-trivial-vp-intrinsics-in-the-risc-v-backend/87999

This splits off 3 intrinsics from #179622.

Note that vp.cttz is the elementwise version, not vp.cttz.elts.


>From 7cf823824f6d1abe01bf0d67a8f94ac6477f8cca Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 1 Apr 2026 15:56:11 +0800
Subject: [PATCH] [RISCV] Remove codegen for vp_ctlz, vp_cttz, vp_ctpop

Part of the work to remove trivial VP intrinsics from the RISC-V backend, see https://discourse.llvm.org/t/rfc-remove-codegen-support-for-trivial-vp-intrinsics-in-the-risc-v-backend/87999

This splits off 3 intrinsics from #179622.

Note that vp.cttz is the elementwise version, not vp.cttz.elts.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |   85 +-
 .../Target/RISCV/RISCVTargetTransformInfo.h   |    3 -
 llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll        | 1724 +++---
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       | 2253 ++++----
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        | 3344 ++++-------
 .../RISCV/rvv/fixed-vectors-ctlz-vp.ll        | 4903 +++--------------
 .../RISCV/rvv/fixed-vectors-ctpop-vp.ll       | 1806 +++---
 .../RISCV/rvv/fixed-vectors-cttz-vp.ll        | 4149 +++-----------
 8 files changed, 5207 insertions(+), 13060 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index cb6489fbeae09..a101d669fbd0b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1101,24 +1101,17 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       if (Subtarget.hasStdExtZvbb()) {
         setOperationAction(ISD::BITREVERSE, VT, Legal);
         setOperationAction(ISD::VP_BITREVERSE, VT, Custom);
-        setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
-                            ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
-                           VT, Custom);
       } else {
         setOperationAction({ISD::BITREVERSE, ISD::VP_BITREVERSE}, VT, Expand);
         setOperationAction({ISD::CTLZ, ISD::CTTZ, ISD::CTPOP}, VT, Expand);
-        setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
-                            ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
-                           VT, Expand);
 
         // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
         // range of f32.
         EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
         if (isTypeLegal(FloatVT)) {
-          setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
-                              ISD::CTTZ_ZERO_UNDEF, ISD::VP_CTLZ,
-                              ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ_ZERO_UNDEF},
-                             VT, Custom);
+          setOperationAction(
+              {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
+              Custom);
         }
       }
 
@@ -6895,15 +6888,6 @@ RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op,
   SDLoc DL(Op);
   MVT ContainerVT = VT;
 
-  SDValue Mask, VL;
-  if (Op->isVPOpcode()) {
-    Mask = Op.getOperand(1);
-    if (VT.isFixedLengthVector())
-      Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG,
-                                     Subtarget);
-    VL = Op.getOperand(2);
-  }
-
   // We choose FP type that can represent the value if possible. Otherwise, we
   // use rounding to zero conversion for correct exponent of the result.
   // TODO: Use f16 for i8 when possible?
@@ -6923,27 +6907,19 @@ RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op,
   if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
     SDValue Neg = DAG.getNegative(Src, DL, VT);
     Src = DAG.getNode(ISD::AND, DL, VT, Src, Neg);
-  } else if (Op.getOpcode() == ISD::VP_CTTZ_ZERO_UNDEF) {
-    SDValue Neg = DAG.getNode(ISD::VP_SUB, DL, VT, DAG.getConstant(0, DL, VT),
-                              Src, Mask, VL);
-    Src = DAG.getNode(ISD::VP_AND, DL, VT, Src, Neg, Mask, VL);
   }
 
   // We have a legal FP type, convert to it.
   SDValue FloatVal;
   if (FloatVT.bitsGT(VT)) {
-    if (Op->isVPOpcode())
-      FloatVal = DAG.getNode(ISD::VP_UINT_TO_FP, DL, FloatVT, Src, Mask, VL);
-    else
-      FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src);
+    FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src);
   } else {
     // Use RTZ to avoid rounding influencing exponent of FloatVal.
     if (VT.isFixedLengthVector()) {
       ContainerVT = getContainerForFixedLengthVector(VT);
       Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
     }
-    if (!Op->isVPOpcode())
-      std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+    auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
     SDValue RTZRM =
         DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, Subtarget.getXLenVT());
     MVT ContainerFloatVT =
@@ -6958,20 +6934,13 @@ RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op,
   SDValue Bitcast = DAG.getBitcast(IntVT, FloatVal);
   unsigned ShiftAmt = FloatEltVT == MVT::f64 ? 52 : 23;
 
-  SDValue Exp;
   // Restore back to original type. Truncation after SRL is to generate vnsrl.
-  if (Op->isVPOpcode()) {
-    Exp = DAG.getNode(ISD::VP_SRL, DL, IntVT, Bitcast,
-                      DAG.getConstant(ShiftAmt, DL, IntVT), Mask, VL);
-    Exp = DAG.getVPZExtOrTrunc(DL, VT, Exp, Mask, VL);
-  } else {
-    Exp = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast,
-                      DAG.getConstant(ShiftAmt, DL, IntVT));
-    if (IntVT.bitsLT(VT))
-      Exp = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Exp);
-    else if (IntVT.bitsGT(VT))
-      Exp = DAG.getNode(ISD::TRUNCATE, DL, VT, Exp);
-  }
+  SDValue Exp = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast,
+                            DAG.getConstant(ShiftAmt, DL, IntVT));
+  if (IntVT.bitsLT(VT))
+    Exp = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Exp);
+  else if (IntVT.bitsGT(VT))
+    Exp = DAG.getNode(ISD::TRUNCATE, DL, VT, Exp);
 
   // The exponent contains log2 of the value in biased form.
   unsigned ExponentBias = FloatEltVT == MVT::f64 ? 1023 : 127;
@@ -6979,27 +6948,18 @@ RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op,
   if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF)
     return DAG.getNode(ISD::SUB, DL, VT, Exp,
                        DAG.getConstant(ExponentBias, DL, VT));
-  if (Op.getOpcode() == ISD::VP_CTTZ_ZERO_UNDEF)
-    return DAG.getNode(ISD::VP_SUB, DL, VT, Exp,
-                       DAG.getConstant(ExponentBias, DL, VT), Mask, VL);
 
   // For leading zeros, we need to remove the bias and convert from log2 to
   // leading zeros. We can do this by subtracting from (Bias + (EltSize - 1)).
   unsigned Adjust = ExponentBias + (EltSize - 1);
-  SDValue Res;
-  if (Op->isVPOpcode())
-    Res = DAG.getNode(ISD::VP_SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Exp,
-                      Mask, VL);
-  else
-    Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Exp);
+  SDValue Res =
+      DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Exp);
 
   // The above result with zero input equals to Adjust which is greater than
   // EltSize. Hence, we can do min(Res, EltSize) for CTLZ.
   if (Op.getOpcode() == ISD::CTLZ)
     Res = DAG.getNode(ISD::UMIN, DL, VT, Res, DAG.getConstant(EltSize, DL, VT));
-  else if (Op.getOpcode() == ISD::VP_CTLZ)
-    Res = DAG.getNode(ISD::VP_UMIN, DL, VT, Res,
-                      DAG.getConstant(EltSize, DL, VT), Mask, VL);
+
   return Res;
 }
 
@@ -7648,14 +7608,9 @@ static unsigned getRISCVVLOp(SDValue Op) {
   VP_CASE(SSUBSAT)    // VP_SSUBSAT
   VP_CASE(USUBSAT)    // VP_USUBSAT
   VP_CASE(BSWAP)      // VP_BSWAP
-  VP_CASE(CTLZ)       // VP_CTLZ
-  VP_CASE(CTTZ)       // VP_CTTZ
-  VP_CASE(CTPOP)      // VP_CTPOP
   case ISD::CTLZ_ZERO_UNDEF:
-  case ISD::VP_CTLZ_ZERO_UNDEF:
     return RISCVISD::CTLZ_VL;
   case ISD::CTTZ_ZERO_UNDEF:
-  case ISD::VP_CTTZ_ZERO_UNDEF:
     return RISCVISD::CTTZ_VL;
   case ISD::FMA:
   case ISD::VP_FMA:
@@ -9136,18 +9091,6 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::VP_BITREVERSE:
   case ISD::VP_BSWAP:
     return lowerVPOp(Op, DAG);
-  case ISD::VP_CTLZ:
-  case ISD::VP_CTLZ_ZERO_UNDEF:
-    if (Subtarget.hasStdExtZvbb())
-      return lowerVPOp(Op, DAG);
-    return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
-  case ISD::VP_CTTZ:
-  case ISD::VP_CTTZ_ZERO_UNDEF:
-    if (Subtarget.hasStdExtZvbb())
-      return lowerVPOp(Op, DAG);
-    return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
-  case ISD::VP_CTPOP:
-    return lowerVPOp(Op, DAG);
   case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
     return lowerVPStridedLoad(Op, DAG);
   case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 18e0ab9b72608..e9ce271bdc6eb 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -377,9 +377,6 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
         Intrinsic::vp_bswap,
         Intrinsic::vp_ceil,
         Intrinsic::vp_copysign,
-        Intrinsic::vp_ctlz,
-        Intrinsic::vp_ctpop,
-        Intrinsic::vp_cttz,
         Intrinsic::vp_cttz_elts,
         Intrinsic::vp_fabs,
         Intrinsic::vp_fadd,
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 2ed155a491657..4c82c8abd4edd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -11,25 +11,22 @@
 define <vscale x 1 x i8> @vp_ctlz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.ctlz.nxv1i8(<vscale x 1 x i8> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i8> %v
@@ -38,7 +35,7 @@ define <vscale x 1 x i8> @vp_ctlz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1
 define <vscale x 1 x i8> @vp_ctlz_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
 ; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
@@ -52,7 +49,7 @@ define <vscale x 1 x i8> @vp_ctlz_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zer
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.ctlz.nxv1i8(<vscale x 1 x i8> %va, i1 false, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -62,25 +59,22 @@ define <vscale x 1 x i8> @vp_ctlz_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zer
 define <vscale x 2 x i8> @vp_ctlz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.ctlz.nxv2i8(<vscale x 2 x i8> %va, i1 false, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i8> %v
@@ -89,7 +83,7 @@ define <vscale x 2 x i8> @vp_ctlz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1
 define <vscale x 2 x i8> @vp_ctlz_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
 ; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
@@ -103,7 +97,7 @@ define <vscale x 2 x i8> @vp_ctlz_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zer
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.ctlz.nxv2i8(<vscale x 2 x i8> %va, i1 false, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -113,25 +107,22 @@ define <vscale x 2 x i8> @vp_ctlz_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zer
 define <vscale x 4 x i8> @vp_ctlz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v10, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.ctlz.nxv4i8(<vscale x 4 x i8> %va, i1 false, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i8> %v
@@ -140,7 +131,7 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1
 define <vscale x 4 x i8> @vp_ctlz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
 ; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
@@ -154,7 +145,7 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zer
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.ctlz.nxv4i8(<vscale x 4 x i8> %va, i1 false, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -164,25 +155,22 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zer
 define <vscale x 8 x i8> @vp_ctlz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v12, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.ctlz.nxv8i8(<vscale x 8 x i8> %va, i1 false, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i8> %v
@@ -191,7 +179,7 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1
 define <vscale x 8 x i8> @vp_ctlz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8
 ; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
@@ -205,7 +193,7 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zer
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.ctlz.nxv8i8(<vscale x 8 x i8> %va, i1 false, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -215,25 +203,22 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zer
 define <vscale x 16 x i8> @vp_ctlz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v16, v8
 ; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.ctlz.nxv16i8(<vscale x 16 x i8> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i8> %v
@@ -242,7 +227,7 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 define <vscale x 16 x i8> @vp_ctlz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v16, v8
 ; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
@@ -256,7 +241,7 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.ctlz.nxv16i8(<vscale x 16 x i8> %va, i1 false, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -266,32 +251,32 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 define <vscale x 32 x i8> @vp_ctlz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 2
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vnot.v v8, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v12, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv32i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.ctlz.nxv32i8(<vscale x 32 x i8> %va, i1 false, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i8> %v
@@ -300,7 +285,7 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 define <vscale x 32 x i8> @vp_ctlz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv32i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -324,7 +309,7 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv32i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.ctlz.nxv32i8(<vscale x 32 x i8> %va, i1 false, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -334,32 +319,32 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 define <vscale x 64 x i8> @vp_ctlz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv64i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 2
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vnot.v v8, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv64i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.ctlz.nxv64i8(<vscale x 64 x i8> %va, i1 false, <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i8> %v
@@ -368,7 +353,7 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 define <vscale x 64 x i8> @vp_ctlz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv64i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -392,7 +377,7 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv64i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.ctlz.nxv64i8(<vscale x 64 x i8> %va, i1 false, <vscale x 64 x i1> splat (i1 true), i32 %evl)
@@ -402,22 +387,19 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 define <vscale x 1 x i16> @vp_ctlz_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.ctlz.nxv1i16(<vscale x 1 x i16> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i16> %v
@@ -426,7 +408,7 @@ define <vscale x 1 x i16> @vp_ctlz_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x
 define <vscale x 1 x i16> @vp_ctlz_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vnsrl.wi v8, v9, 23
@@ -437,7 +419,7 @@ define <vscale x 1 x i16> @vp_ctlz_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.ctlz.nxv1i16(<vscale x 1 x i16> %va, i1 false, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -447,22 +429,19 @@ define <vscale x 1 x i16> @vp_ctlz_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 define <vscale x 2 x i16> @vp_ctlz_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.ctlz.nxv2i16(<vscale x 2 x i16> %va, i1 false, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i16> %v
@@ -471,7 +450,7 @@ define <vscale x 2 x i16> @vp_ctlz_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x
 define <vscale x 2 x i16> @vp_ctlz_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vnsrl.wi v8, v9, 23
@@ -482,7 +461,7 @@ define <vscale x 2 x i16> @vp_ctlz_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.ctlz.nxv2i16(<vscale x 2 x i16> %va, i1 false, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -492,22 +471,19 @@ define <vscale x 2 x i16> @vp_ctlz_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 define <vscale x 4 x i16> @vp_ctlz_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.ctlz.nxv4i16(<vscale x 4 x i16> %va, i1 false, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i16> %v
@@ -516,7 +492,7 @@ define <vscale x 4 x i16> @vp_ctlz_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x
 define <vscale x 4 x i16> @vp_ctlz_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
 ; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 23
@@ -527,7 +503,7 @@ define <vscale x 4 x i16> @vp_ctlz_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.ctlz.nxv4i16(<vscale x 4 x i16> %va, i1 false, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -537,22 +513,19 @@ define <vscale x 4 x i16> @vp_ctlz_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 define <vscale x 8 x i16> @vp_ctlz_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.ctlz.nxv8i16(<vscale x 8 x i16> %va, i1 false, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i16> %v
@@ -561,7 +534,7 @@ define <vscale x 8 x i16> @vp_ctlz_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x
 define <vscale x 8 x i16> @vp_ctlz_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
 ; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 23
@@ -572,7 +545,7 @@ define <vscale x 8 x i16> @vp_ctlz_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.ctlz.nxv8i16(<vscale x 8 x i16> %va, i1 false, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -582,22 +555,19 @@ define <vscale x 8 x i16> @vp_ctlz_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 define <vscale x 16 x i16> @vp_ctlz_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.ctlz.nxv16i16(<vscale x 16 x i16> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i16> %v
@@ -606,7 +576,7 @@ define <vscale x 16 x i16> @vp_ctlz_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 define <vscale x 16 x i16> @vp_ctlz_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
 ; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 23
@@ -617,7 +587,7 @@ define <vscale x 16 x i16> @vp_ctlz_nxv16i16_unmasked(<vscale x 16 x i16> %va, i
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.ctlz.nxv16i16(<vscale x 16 x i16> %va, i1 false, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -627,41 +597,41 @@ define <vscale x 16 x i16> @vp_ctlz_nxv16i16_unmasked(<vscale x 16 x i16> %va, i
 define <vscale x 32 x i16> @vp_ctlz_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv32i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 2
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 8
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vnot.v v8, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv32i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.ctlz.nxv32i16(<vscale x 32 x i16> %va, i1 false, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
@@ -670,7 +640,7 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 define <vscale x 32 x i16> @vp_ctlz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv32i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -703,7 +673,7 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv32i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.ctlz.nxv32i16(<vscale x 32 x i16> %va, i1 false, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -713,23 +683,20 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 define <vscale x 1 x i32> @vp_ctlz_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    vnsrl.wx v8, v9, a0
 ; CHECK-NEXT:    li a0, 1054
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.ctlz.nxv1i32(<vscale x 1 x i32> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i32> %v
@@ -738,7 +705,7 @@ define <vscale x 1 x i32> @vp_ctlz_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x
 define <vscale x 1 x i32> @vp_ctlz_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v9, a0
@@ -750,7 +717,7 @@ define <vscale x 1 x i32> @vp_ctlz_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.ctlz.nxv1i32(<vscale x 1 x i32> %va, i1 false, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -760,23 +727,20 @@ define <vscale x 1 x i32> @vp_ctlz_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 define <vscale x 2 x i32> @vp_ctlz_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    vnsrl.wx v8, v10, a0
 ; CHECK-NEXT:    li a0, 1054
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.ctlz.nxv2i32(<vscale x 2 x i32> %va, i1 false, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i32> %v
@@ -785,7 +749,7 @@ define <vscale x 2 x i32> @vp_ctlz_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x
 define <vscale x 2 x i32> @vp_ctlz_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v10, a0
@@ -797,7 +761,7 @@ define <vscale x 2 x i32> @vp_ctlz_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.ctlz.nxv2i32(<vscale x 2 x i32> %va, i1 false, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -807,23 +771,20 @@ define <vscale x 2 x i32> @vp_ctlz_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 define <vscale x 4 x i32> @vp_ctlz_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    vnsrl.wx v8, v12, a0
 ; CHECK-NEXT:    li a0, 1054
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.ctlz.nxv4i32(<vscale x 4 x i32> %va, i1 false, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i32> %v
@@ -832,7 +793,7 @@ define <vscale x 4 x i32> @vp_ctlz_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x
 define <vscale x 4 x i32> @vp_ctlz_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v12, a0
@@ -844,7 +805,7 @@ define <vscale x 4 x i32> @vp_ctlz_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.ctlz.nxv4i32(<vscale x 4 x i32> %va, i1 false, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -854,23 +815,20 @@ define <vscale x 4 x i32> @vp_ctlz_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 define <vscale x 8 x i32> @vp_ctlz_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vnsrl.wx v8, v16, a0
 ; CHECK-NEXT:    li a0, 1054
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.ctlz.nxv8i32(<vscale x 8 x i32> %va, i1 false, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i32> %v
@@ -879,7 +837,7 @@ define <vscale x 8 x i32> @vp_ctlz_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x
 define <vscale x 8 x i32> @vp_ctlz_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v16, a0
@@ -891,7 +849,7 @@ define <vscale x 8 x i32> @vp_ctlz_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.ctlz.nxv8i32(<vscale x 8 x i32> %va, i1 false, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -901,21 +859,21 @@ define <vscale x 8 x i32> @vp_ctlz_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 define <vscale x 16 x i32> @vp_ctlz_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 158
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 158
+; CHECK-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i32> %v
@@ -924,20 +882,20 @@ define <vscale x 16 x i32> @vp_ctlz_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 define <vscale x 16 x i32> @vp_ctlz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    li a1, 158
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 false, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -947,22 +905,22 @@ define <vscale x 16 x i32> @vp_ctlz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i
 define <vscale x 1 x i64> @vp_ctlz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i64> %v
@@ -971,21 +929,21 @@ define <vscale x 1 x i64> @vp_ctlz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
 define <vscale x 1 x i64> @vp_ctlz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 false, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -995,22 +953,22 @@ define <vscale x 1 x i64> @vp_ctlz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 define <vscale x 2 x i64> @vp_ctlz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 false, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i64> %v
@@ -1019,21 +977,21 @@ define <vscale x 2 x i64> @vp_ctlz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
 define <vscale x 2 x i64> @vp_ctlz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 false, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -1043,22 +1001,22 @@ define <vscale x 2 x i64> @vp_ctlz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 define <vscale x 4 x i64> @vp_ctlz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 false, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i64> %v
@@ -1067,21 +1025,21 @@ define <vscale x 4 x i64> @vp_ctlz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
 define <vscale x 4 x i64> @vp_ctlz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 false, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -1091,22 +1049,22 @@ define <vscale x 4 x i64> @vp_ctlz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 define <vscale x 7 x i64> @vp_ctlz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv7i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv7i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.ctlz.nxv7i64(<vscale x 7 x i64> %va, i1 false, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x i64> %v
@@ -1115,21 +1073,21 @@ define <vscale x 7 x i64> @vp_ctlz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x
 define <vscale x 7 x i64> @vp_ctlz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv7i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv7i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.ctlz.nxv7i64(<vscale x 7 x i64> %va, i1 false, <vscale x 7 x i1> splat (i1 true), i32 %evl)
@@ -1139,22 +1097,22 @@ define <vscale x 7 x i64> @vp_ctlz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 define <vscale x 8 x i64> @vp_ctlz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 false, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i64> %v
@@ -1163,21 +1121,21 @@ define <vscale x 8 x i64> @vp_ctlz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x
 define <vscale x 8 x i64> @vp_ctlz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 false, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -1187,59 +1145,27 @@ define <vscale x 8 x i64> @vp_ctlz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    fsrmi a4, 1
-; CHECK-NEXT:    li a2, 52
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    sub a5, a0, a1
-; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    sltu a3, a0, a5
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a5, a3, a5
-; CHECK-NEXT:    li a3, 1086
-; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
-; CHECK-NEXT:    fsrm a4
-; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    vrsub.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    li a4, 64
-; CHECK-NEXT:    vminu.vx v16, v16, a4, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB46_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB46_2:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    vminu.vx v8, v8, a4, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1086
+; CHECK-NEXT:    li a3, 64
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
+; CHECK-NEXT:    vrsub.vx v16, v16, a2
+; CHECK-NEXT:    vminu.vx v8, v8, a3
+; CHECK-NEXT:    vminu.vx v16, v16, a3
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    sub a3, a0, a1
-; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
-; CHECK-ZVBB-NEXT:    addi a2, a2, -1
-; CHECK-ZVBB-NEXT:    and a2, a2, a3
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v16, v16, v0.t
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB46_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB46_2:
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
+; CHECK-ZVBB-NEXT:    vclz.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.ctlz.nxv16i64(<vscale x 16 x i64> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i64> %v
@@ -1248,49 +1174,27 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    fsrmi a4, 1
-; CHECK-NEXT:    li a2, 52
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a5, a0, a3
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a5, a5, a3
-; CHECK-NEXT:    li a3, 1086
-; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-NEXT:    fsrm a4
-; CHECK-NEXT:    vsrl.vx v16, v16, a2
-; CHECK-NEXT:    vrsub.vx v16, v16, a3
-; CHECK-NEXT:    li a4, 64
-; CHECK-NEXT:    vminu.vx v16, v16, a4
-; CHECK-NEXT:    bltu a0, a1, .LBB47_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB47_2:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vx v8, v8, a2
-; CHECK-NEXT:    vrsub.vx v8, v8, a3
-; CHECK-NEXT:    vminu.vx v8, v8, a4
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1086
+; CHECK-NEXT:    li a3, 64
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
+; CHECK-NEXT:    vrsub.vx v16, v16, a2
+; CHECK-NEXT:    vminu.vx v8, v8, a3
+; CHECK-NEXT:    vminu.vx v16, v16, a3
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v16, v16
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB47_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB47_2:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
+; CHECK-ZVBB-NEXT:    vclz.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.ctlz.nxv16i64(<vscale x 16 x i64> %va, i1 false, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x i64> %v
@@ -1299,23 +1203,20 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 define <vscale x 1 x i8> @vp_ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
-; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.ctlz.nxv1i8(<vscale x 1 x i8> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i8> %v
@@ -1324,7 +1225,7 @@ define <vscale x 1 x i8> @vp_ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 define <vscale x 1 x i8> @vp_ctlz_zero_undef_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 23
@@ -1336,7 +1237,7 @@ define <vscale x 1 x i8> @vp_ctlz_zero_undef_nxv1i8_unmasked(<vscale x 1 x i8> %
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.ctlz.nxv1i8(<vscale x 1 x i8> %va, i1 true, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -1346,23 +1247,20 @@ define <vscale x 1 x i8> @vp_ctlz_zero_undef_nxv1i8_unmasked(<vscale x 1 x i8> %
 define <vscale x 2 x i8> @vp_ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
-; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.ctlz.nxv2i8(<vscale x 2 x i8> %va, i1 true, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i8> %v
@@ -1371,7 +1269,7 @@ define <vscale x 2 x i8> @vp_ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 define <vscale x 2 x i8> @vp_ctlz_zero_undef_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 23
@@ -1383,7 +1281,7 @@ define <vscale x 2 x i8> @vp_ctlz_zero_undef_nxv2i8_unmasked(<vscale x 2 x i8> %
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.ctlz.nxv2i8(<vscale x 2 x i8> %va, i1 true, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -1393,23 +1291,20 @@ define <vscale x 2 x i8> @vp_ctlz_zero_undef_nxv2i8_unmasked(<vscale x 2 x i8> %
 define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
-; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v10, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.ctlz.nxv4i8(<vscale x 4 x i8> %va, i1 true, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i8> %v
@@ -1418,7 +1313,7 @@ define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 23
@@ -1430,7 +1325,7 @@ define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8_unmasked(<vscale x 4 x i8> %
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.ctlz.nxv4i8(<vscale x 4 x i8> %va, i1 true, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -1440,23 +1335,20 @@ define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8_unmasked(<vscale x 4 x i8> %
 define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
-; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v12, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.ctlz.nxv8i8(<vscale x 8 x i8> %va, i1 true, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i8> %v
@@ -1465,7 +1357,7 @@ define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 23
@@ -1477,7 +1369,7 @@ define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8_unmasked(<vscale x 8 x i8> %
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.ctlz.nxv8i8(<vscale x 8 x i8> %va, i1 true, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -1487,23 +1379,20 @@ define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8_unmasked(<vscale x 8 x i8> %
 define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
-; CHECK-NEXT:    li a0, 134
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.ctlz.nxv16i8(<vscale x 16 x i8> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i8> %v
@@ -1512,7 +1401,7 @@ define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v16, v8
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 23
@@ -1524,7 +1413,7 @@ define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8_unmasked(<vscale x 16 x i8
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.ctlz.nxv16i8(<vscale x 16 x i8> %va, i1 true, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -1534,32 +1423,32 @@ define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8_unmasked(<vscale x 16 x i8
 define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 2
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vnot.v v8, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v12, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv32i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.ctlz.nxv32i8(<vscale x 32 x i8> %va, i1 true, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i8> %v
@@ -1568,7 +1457,7 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <v
 define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv32i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -1592,7 +1481,7 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv32i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.ctlz.nxv32i8(<vscale x 32 x i8> %va, i1 true, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1602,32 +1491,32 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv64i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 2
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vnot.v v8, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv64i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.ctlz.nxv64i8(<vscale x 64 x i8> %va, i1 true, <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i8> %v
@@ -1636,7 +1525,7 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <v
 define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv64i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -1660,7 +1549,7 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv64i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.ctlz.nxv64i8(<vscale x 64 x i8> %va, i1 true, <vscale x 64 x i1> splat (i1 true), i32 %evl)
@@ -1670,20 +1559,17 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 define <vscale x 1 x i16> @vp_ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.ctlz.nxv1i16(<vscale x 1 x i16> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i16> %v
@@ -1692,7 +1578,7 @@ define <vscale x 1 x i16> @vp_ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va, <v
 define <vscale x 1 x i16> @vp_ctlz_zero_undef_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
@@ -1701,7 +1587,7 @@ define <vscale x 1 x i16> @vp_ctlz_zero_undef_nxv1i16_unmasked(<vscale x 1 x i16
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.ctlz.nxv1i16(<vscale x 1 x i16> %va, i1 true, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -1711,20 +1597,17 @@ define <vscale x 1 x i16> @vp_ctlz_zero_undef_nxv1i16_unmasked(<vscale x 1 x i16
 define <vscale x 2 x i16> @vp_ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.ctlz.nxv2i16(<vscale x 2 x i16> %va, i1 true, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i16> %v
@@ -1733,7 +1616,7 @@ define <vscale x 2 x i16> @vp_ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va, <v
 define <vscale x 2 x i16> @vp_ctlz_zero_undef_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
@@ -1742,7 +1625,7 @@ define <vscale x 2 x i16> @vp_ctlz_zero_undef_nxv2i16_unmasked(<vscale x 2 x i16
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.ctlz.nxv2i16(<vscale x 2 x i16> %va, i1 true, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -1752,20 +1635,17 @@ define <vscale x 2 x i16> @vp_ctlz_zero_undef_nxv2i16_unmasked(<vscale x 2 x i16
 define <vscale x 4 x i16> @vp_ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.ctlz.nxv4i16(<vscale x 4 x i16> %va, i1 true, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i16> %v
@@ -1774,7 +1654,7 @@ define <vscale x 4 x i16> @vp_ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va, <v
 define <vscale x 4 x i16> @vp_ctlz_zero_undef_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-NEXT:    li a0, 142
@@ -1783,7 +1663,7 @@ define <vscale x 4 x i16> @vp_ctlz_zero_undef_nxv4i16_unmasked(<vscale x 4 x i16
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.ctlz.nxv4i16(<vscale x 4 x i16> %va, i1 true, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -1793,20 +1673,17 @@ define <vscale x 4 x i16> @vp_ctlz_zero_undef_nxv4i16_unmasked(<vscale x 4 x i16
 define <vscale x 8 x i16> @vp_ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.ctlz.nxv8i16(<vscale x 8 x i16> %va, i1 true, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i16> %v
@@ -1815,7 +1692,7 @@ define <vscale x 8 x i16> @vp_ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va, <v
 define <vscale x 8 x i16> @vp_ctlz_zero_undef_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-NEXT:    li a0, 142
@@ -1824,7 +1701,7 @@ define <vscale x 8 x i16> @vp_ctlz_zero_undef_nxv8i16_unmasked(<vscale x 8 x i16
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.ctlz.nxv8i16(<vscale x 8 x i16> %va, i1 true, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -1834,20 +1711,17 @@ define <vscale x 8 x i16> @vp_ctlz_zero_undef_nxv8i16_unmasked(<vscale x 8 x i16
 define <vscale x 16 x i16> @vp_ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.ctlz.nxv16i16(<vscale x 16 x i16> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i16> %v
@@ -1856,7 +1730,7 @@ define <vscale x 16 x i16> @vp_ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va,
 define <vscale x 16 x i16> @vp_ctlz_zero_undef_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-NEXT:    li a0, 142
@@ -1865,7 +1739,7 @@ define <vscale x 16 x i16> @vp_ctlz_zero_undef_nxv16i16_unmasked(<vscale x 16 x
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.ctlz.nxv16i16(<vscale x 16 x i16> %va, i1 true, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -1875,41 +1749,41 @@ define <vscale x 16 x i16> @vp_ctlz_zero_undef_nxv16i16_unmasked(<vscale x 16 x
 define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv32i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 2
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 8
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vnot.v v8, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv32i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.ctlz.nxv32i16(<vscale x 32 x i16> %va, i1 true, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
@@ -1918,7 +1792,7 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va,
 define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv32i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -1951,7 +1825,7 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv32i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.ctlz.nxv32i16(<vscale x 32 x i16> %va, i1 true, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1961,21 +1835,18 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 define <vscale x 1 x i32> @vp_ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    vnsrl.wx v8, v9, a0
 ; CHECK-NEXT:    li a0, 1054
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.ctlz.nxv1i32(<vscale x 1 x i32> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i32> %v
@@ -1984,7 +1855,7 @@ define <vscale x 1 x i32> @vp_ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va, <v
 define <vscale x 1 x i32> @vp_ctlz_zero_undef_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v9, a0
@@ -1994,7 +1865,7 @@ define <vscale x 1 x i32> @vp_ctlz_zero_undef_nxv1i32_unmasked(<vscale x 1 x i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.ctlz.nxv1i32(<vscale x 1 x i32> %va, i1 true, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -2004,21 +1875,18 @@ define <vscale x 1 x i32> @vp_ctlz_zero_undef_nxv1i32_unmasked(<vscale x 1 x i32
 define <vscale x 2 x i32> @vp_ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    vnsrl.wx v8, v10, a0
 ; CHECK-NEXT:    li a0, 1054
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.ctlz.nxv2i32(<vscale x 2 x i32> %va, i1 true, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i32> %v
@@ -2027,7 +1895,7 @@ define <vscale x 2 x i32> @vp_ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va, <v
 define <vscale x 2 x i32> @vp_ctlz_zero_undef_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v10, a0
@@ -2037,7 +1905,7 @@ define <vscale x 2 x i32> @vp_ctlz_zero_undef_nxv2i32_unmasked(<vscale x 2 x i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.ctlz.nxv2i32(<vscale x 2 x i32> %va, i1 true, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -2047,21 +1915,18 @@ define <vscale x 2 x i32> @vp_ctlz_zero_undef_nxv2i32_unmasked(<vscale x 2 x i32
 define <vscale x 4 x i32> @vp_ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    vnsrl.wx v8, v12, a0
 ; CHECK-NEXT:    li a0, 1054
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.ctlz.nxv4i32(<vscale x 4 x i32> %va, i1 true, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i32> %v
@@ -2070,7 +1935,7 @@ define <vscale x 4 x i32> @vp_ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va, <v
 define <vscale x 4 x i32> @vp_ctlz_zero_undef_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v12, a0
@@ -2080,7 +1945,7 @@ define <vscale x 4 x i32> @vp_ctlz_zero_undef_nxv4i32_unmasked(<vscale x 4 x i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.ctlz.nxv4i32(<vscale x 4 x i32> %va, i1 true, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -2090,21 +1955,18 @@ define <vscale x 4 x i32> @vp_ctlz_zero_undef_nxv4i32_unmasked(<vscale x 4 x i32
 define <vscale x 8 x i32> @vp_ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vnsrl.wx v8, v16, a0
 ; CHECK-NEXT:    li a0, 1054
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.ctlz.nxv8i32(<vscale x 8 x i32> %va, i1 true, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i32> %v
@@ -2113,7 +1975,7 @@ define <vscale x 8 x i32> @vp_ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va, <v
 define <vscale x 8 x i32> @vp_ctlz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v16, a0
@@ -2123,7 +1985,7 @@ define <vscale x 8 x i32> @vp_ctlz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.ctlz.nxv8i32(<vscale x 8 x i32> %va, i1 true, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -2133,19 +1995,19 @@ define <vscale x 8 x i32> @vp_ctlz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32
 define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 158
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-NEXT:    li a1, 158
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i32> %v
@@ -2154,18 +2016,18 @@ define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va,
 define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-NEXT:    li a0, 158
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 158
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 true, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -2175,20 +2037,20 @@ define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32_unmasked(<vscale x 16 x
 define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i64> %v
@@ -2197,19 +2059,19 @@ define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <v
 define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 true, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -2219,20 +2081,20 @@ define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64
 define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 true, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i64> %v
@@ -2241,19 +2103,19 @@ define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <v
 define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 true, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -2263,20 +2125,20 @@ define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64
 define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 true, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i64> %v
@@ -2285,19 +2147,19 @@ define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <v
 define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 true, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -2307,20 +2169,20 @@ define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64
 define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv7i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv7i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.ctlz.nxv7i64(<vscale x 7 x i64> %va, i1 true, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x i64> %v
@@ -2329,19 +2191,19 @@ define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <v
 define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv7i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv7i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.ctlz.nxv7i64(<vscale x 7 x i64> %va, i1 true, <vscale x 7 x i1> splat (i1 true), i32 %evl)
@@ -2351,20 +2213,20 @@ define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64
 define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 true, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i64> %v
@@ -2373,19 +2235,19 @@ define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <v
 define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 true, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -2395,56 +2257,24 @@ define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    sub a4, a0, a1
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a4
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a4, a2, a4
-; CHECK-NEXT:    li a2, 52
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
-; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    li a3, 1086
-; CHECK-NEXT:    vrsub.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB94_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB94_2:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1086
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
+; CHECK-NEXT:    vrsub.vx v16, v16, a2
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    sub a3, a0, a1
-; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
-; CHECK-ZVBB-NEXT:    addi a2, a2, -1
-; CHECK-ZVBB-NEXT:    and a2, a2, a3
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v16, v16, v0.t
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB94_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB94_2:
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
+; CHECK-ZVBB-NEXT:    vclz.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.ctlz.nxv16i64(<vscale x 16 x i64> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i64> %v
@@ -2453,46 +2283,24 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a2
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a4, a4, a2
-; CHECK-NEXT:    li a2, 52
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    vsrl.vx v16, v16, a2
-; CHECK-NEXT:    li a3, 1086
-; CHECK-NEXT:    vrsub.vx v16, v16, a3
-; CHECK-NEXT:    bltu a0, a1, .LBB95_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB95_2:
-; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vx v8, v8, a2
-; CHECK-NEXT:    vrsub.vx v8, v8, a3
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1086
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
+; CHECK-NEXT:    vrsub.vx v16, v16, a2
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vclz.v v16, v16
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB95_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB95_2:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8
+; CHECK-ZVBB-NEXT:    vclz.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.ctlz.nxv16i64(<vscale x 16 x i64> %va, i1 true, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x i64> %v
@@ -2502,28 +2310,25 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 define <vscale x 1 x i9> @vp_ctlz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    li a0, 511
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vi v8, v8, -7, v0.t
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vi v8, v8, -7
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 511
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    vadd.vi v8, v8, -7, v0.t
+; CHECK-ZVBB-NEXT:    li a0, 511
+; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
+; CHECK-ZVBB-NEXT:    vadd.vi v8, v8, -7
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i9> %v
@@ -2531,22 +2336,19 @@ define <vscale x 1 x i9> @vp_ctlz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 7
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7, v0.t
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i9> %v
@@ -2554,24 +2356,21 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
 define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlo_nxv1i9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 7
+; CHECK-NEXT:    vnot.v v8, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7, v0.t
-; CHECK-ZVBB-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7
+; CHECK-ZVBB-NEXT:    vnot.v v8, v8
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
   %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
@@ -2580,26 +2379,23 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 511
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vxor.vx v8, v8, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 7
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 511
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vxor.vx v8, v8, a1
-; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7, v0.t
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    li a0, 511
+; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vxor.vx v8, v8, a0
+; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
   %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
@@ -2609,24 +2405,29 @@ define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
 define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlo_nxv1i9_vp_xor:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 511
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vi v8, v8, -7
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9_vp_xor:
 ; CHECK-ZVBB:       # %bb.0:
+; CHECK-ZVBB-NEXT:    li a1, 511
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7, v0.t
-; CHECK-ZVBB-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vxor.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
+; CHECK-ZVBB-NEXT:    vadd.vi v8, v8, -7
 ; CHECK-ZVBB-NEXT:    ret
   %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
   %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
@@ -2639,14 +2440,12 @@ define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9_vp_xor(<vscale x 1 x i9> %va
 ; CHECK-NEXT:    li a1, 511
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vxor.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 7
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
-; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor:
@@ -2654,8 +2453,9 @@ define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9_vp_xor(<vscale x 1 x i9> %va
 ; CHECK-ZVBB-NEXT:    li a1, 511
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vxor.vx v8, v8, a1, v0.t
-; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7, v0.t
-; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
   %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 8b236a8594e03..a734377f687a5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -11,25 +11,25 @@
 define <vscale x 1 x i8> @vp_ctpop_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.ctpop.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i8> %v
@@ -38,7 +38,7 @@ define <vscale x 1 x i8> @vp_ctpop_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i
 define <vscale x 1 x i8> @vp_ctpop_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv1i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
@@ -55,7 +55,7 @@ define <vscale x 1 x i8> @vp_ctpop_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 ze
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.ctpop.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -65,25 +65,25 @@ define <vscale x 1 x i8> @vp_ctpop_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 ze
 define <vscale x 2 x i8> @vp_ctpop_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv2i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.ctpop.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i8> %v
@@ -92,7 +92,7 @@ define <vscale x 2 x i8> @vp_ctpop_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i
 define <vscale x 2 x i8> @vp_ctpop_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
@@ -109,7 +109,7 @@ define <vscale x 2 x i8> @vp_ctpop_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 ze
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv2i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.ctpop.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -119,25 +119,25 @@ define <vscale x 2 x i8> @vp_ctpop_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 ze
 define <vscale x 4 x i8> @vp_ctpop_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv4i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.ctpop.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i8> %v
@@ -146,7 +146,7 @@ define <vscale x 4 x i8> @vp_ctpop_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i
 define <vscale x 4 x i8> @vp_ctpop_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
@@ -163,7 +163,7 @@ define <vscale x 4 x i8> @vp_ctpop_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 ze
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv4i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.ctpop.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -173,25 +173,25 @@ define <vscale x 4 x i8> @vp_ctpop_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 ze
 define <vscale x 8 x i8> @vp_ctpop_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.ctpop.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i8> %v
@@ -200,7 +200,7 @@ define <vscale x 8 x i8> @vp_ctpop_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i
 define <vscale x 8 x i8> @vp_ctpop_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
@@ -217,7 +217,7 @@ define <vscale x 8 x i8> @vp_ctpop_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 ze
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.ctpop.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -227,25 +227,25 @@ define <vscale x 8 x i8> @vp_ctpop_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 ze
 define <vscale x 16 x i8> @vp_ctpop_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.ctpop.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i8> %v
@@ -254,7 +254,7 @@ define <vscale x 16 x i8> @vp_ctpop_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 define <vscale x 16 x i8> @vp_ctpop_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0
@@ -271,7 +271,7 @@ define <vscale x 16 x i8> @vp_ctpop_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.ctpop.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -281,25 +281,25 @@ define <vscale x 16 x i8> @vp_ctpop_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 define <vscale x 32 x i8> @vp_ctpop_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v12, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv32i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.ctpop.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i8> %v
@@ -308,7 +308,7 @@ define <vscale x 32 x i8> @vp_ctpop_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 define <vscale x 32 x i8> @vp_ctpop_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv32i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
@@ -325,7 +325,7 @@ define <vscale x 32 x i8> @vp_ctpop_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv32i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.ctpop.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -335,25 +335,25 @@ define <vscale x 32 x i8> @vp_ctpop_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 define <vscale x 64 x i8> @vp_ctpop_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv64i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv64i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.ctpop.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i8> %v
@@ -362,7 +362,7 @@ define <vscale x 64 x i8> @vp_ctpop_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 define <vscale x 64 x i8> @vp_ctpop_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv64i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
@@ -379,7 +379,7 @@ define <vscale x 64 x i8> @vp_ctpop_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv64i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.ctpop.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> splat (i1 true), i32 %evl)
@@ -389,32 +389,32 @@ define <vscale x 64 x i8> @vp_ctpop_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 define <vscale x 1 x i16> @vp_ctpop_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.ctpop.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i16> %v
@@ -423,7 +423,7 @@ define <vscale x 1 x i16> @vp_ctpop_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1
 define <vscale x 1 x i16> @vp_ctpop_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv1i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -447,7 +447,7 @@ define <vscale x 1 x i16> @vp_ctpop_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.ctpop.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -457,32 +457,32 @@ define <vscale x 1 x i16> @vp_ctpop_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 define <vscale x 2 x i16> @vp_ctpop_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv2i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.ctpop.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i16> %v
@@ -491,7 +491,7 @@ define <vscale x 2 x i16> @vp_ctpop_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2
 define <vscale x 2 x i16> @vp_ctpop_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -515,7 +515,7 @@ define <vscale x 2 x i16> @vp_ctpop_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv2i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.ctpop.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -525,32 +525,32 @@ define <vscale x 2 x i16> @vp_ctpop_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 define <vscale x 4 x i16> @vp_ctpop_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv4i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.ctpop.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i16> %v
@@ -559,7 +559,7 @@ define <vscale x 4 x i16> @vp_ctpop_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4
 define <vscale x 4 x i16> @vp_ctpop_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -583,7 +583,7 @@ define <vscale x 4 x i16> @vp_ctpop_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv4i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.ctpop.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -593,32 +593,32 @@ define <vscale x 4 x i16> @vp_ctpop_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 define <vscale x 8 x i16> @vp_ctpop_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.ctpop.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i16> %v
@@ -627,7 +627,7 @@ define <vscale x 8 x i16> @vp_ctpop_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8
 define <vscale x 8 x i16> @vp_ctpop_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -651,7 +651,7 @@ define <vscale x 8 x i16> @vp_ctpop_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.ctpop.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -661,32 +661,32 @@ define <vscale x 8 x i16> @vp_ctpop_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 define <vscale x 16 x i16> @vp_ctpop_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v12, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.ctpop.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i16> %v
@@ -695,7 +695,7 @@ define <vscale x 16 x i16> @vp_ctpop_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 define <vscale x 16 x i16> @vp_ctpop_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -719,7 +719,7 @@ define <vscale x 16 x i16> @vp_ctpop_nxv16i16_unmasked(<vscale x 16 x i16> %va,
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.ctpop.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -729,32 +729,32 @@ define <vscale x 16 x i16> @vp_ctpop_nxv16i16_unmasked(<vscale x 16 x i16> %va,
 define <vscale x 32 x i16> @vp_ctpop_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv32i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv32i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.ctpop.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
@@ -763,7 +763,7 @@ define <vscale x 32 x i16> @vp_ctpop_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 define <vscale x 32 x i16> @vp_ctpop_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv32i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -787,7 +787,7 @@ define <vscale x 32 x i16> @vp_ctpop_nxv32i16_unmasked(<vscale x 32 x i16> %va,
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv32i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.ctpop.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -797,33 +797,33 @@ define <vscale x 32 x i16> @vp_ctpop_nxv32i16_unmasked(<vscale x 32 x i16> %va,
 define <vscale x 1 x i32> @vp_ctpop_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.ctpop.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i32> %v
@@ -832,7 +832,7 @@ define <vscale x 1 x i32> @vp_ctpop_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1
 define <vscale x 1 x i32> @vp_ctpop_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv1i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -857,7 +857,7 @@ define <vscale x 1 x i32> @vp_ctpop_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.ctpop.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -867,33 +867,33 @@ define <vscale x 1 x i32> @vp_ctpop_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 define <vscale x 2 x i32> @vp_ctpop_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv2i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.ctpop.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i32> %v
@@ -902,7 +902,7 @@ define <vscale x 2 x i32> @vp_ctpop_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2
 define <vscale x 2 x i32> @vp_ctpop_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -927,7 +927,7 @@ define <vscale x 2 x i32> @vp_ctpop_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv2i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.ctpop.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -937,33 +937,33 @@ define <vscale x 2 x i32> @vp_ctpop_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 define <vscale x 4 x i32> @vp_ctpop_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.ctpop.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i32> %v
@@ -972,7 +972,7 @@ define <vscale x 4 x i32> @vp_ctpop_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4
 define <vscale x 4 x i32> @vp_ctpop_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -997,7 +997,7 @@ define <vscale x 4 x i32> @vp_ctpop_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv4i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.ctpop.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -1007,33 +1007,33 @@ define <vscale x 4 x i32> @vp_ctpop_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 define <vscale x 8 x i32> @vp_ctpop_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v12, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.ctpop.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i32> %v
@@ -1042,7 +1042,7 @@ define <vscale x 8 x i32> @vp_ctpop_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8
 define <vscale x 8 x i32> @vp_ctpop_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -1067,7 +1067,7 @@ define <vscale x 8 x i32> @vp_ctpop_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.ctpop.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -1077,33 +1077,33 @@ define <vscale x 8 x i32> @vp_ctpop_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 define <vscale x 16 x i32> @vp_ctpop_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.ctpop.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i32> %v
@@ -1112,7 +1112,7 @@ define <vscale x 16 x i32> @vp_ctpop_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 define <vscale x 16 x i32> @vp_ctpop_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -1137,7 +1137,7 @@ define <vscale x 16 x i32> @vp_ctpop_nxv16i32_unmasked(<vscale x 16 x i32> %va,
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.ctpop.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -1147,80 +1147,80 @@ define <vscale x 16 x i32> @vp_ctpop_nxv16i32_unmasked(<vscale x 16 x i32> %va,
 define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv1i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv1i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v9, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV64-NEXT:    vsrl.vi v9, v8, 1
+; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    vsub.vv v8, v8, v9
+; RV64-NEXT:    vand.vx v9, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v9, v8
+; RV64-NEXT:    vsrl.vi v9, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v9
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.ctpop.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i64> %v
@@ -1229,36 +1229,36 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv1i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    vand.vv v9, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
@@ -1267,41 +1267,41 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a2
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.ctpop.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -1311,80 +1311,80 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vsub.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v10, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv2i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.ctpop.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i64> %v
@@ -1393,36 +1393,36 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv2i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    vand.vv v10, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
@@ -1431,41 +1431,41 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv2i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.ctpop.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -1475,80 +1475,80 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v12, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v12
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vsub.vv v8, v8, v12
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v12, v8
+; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v12
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv4i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.ctpop.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i64> %v
@@ -1557,36 +1557,36 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv4i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    vand.vv v12, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
@@ -1595,41 +1595,41 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a2
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv4i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.ctpop.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -1639,80 +1639,80 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 define <vscale x 7 x i64> @vp_ctpop_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv7i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv7i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv7i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.ctpop.nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x i64> %v
@@ -1721,36 +1721,36 @@ define <vscale x 7 x i64> @vp_ctpop_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 define <vscale x 7 x i64> @vp_ctpop_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv7i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
@@ -1759,41 +1759,41 @@ define <vscale x 7 x i64> @vp_ctpop_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv7i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.ctpop.nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> splat (i1 true), i32 %evl)
@@ -1803,80 +1803,80 @@ define <vscale x 7 x i64> @vp_ctpop_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 define <vscale x 8 x i64> @vp_ctpop_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.ctpop.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i64> %v
@@ -1885,36 +1885,36 @@ define <vscale x 8 x i64> @vp_ctpop_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 define <vscale x 8 x i64> @vp_ctpop_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv8i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
@@ -1923,41 +1923,41 @@ define <vscale x 8 x i64> @vp_ctpop_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.ctpop.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -1969,131 +1969,82 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; RV32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vmv1r.v v7, v0
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    srli a3, a1, 3
-; RV32-NEXT:    vslidedown.vx v0, v0, a3
-; RV32-NEXT:    sub a3, a0, a1
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    sltu a2, a0, a3
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a2, a2, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v8, v0.t
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v16, a2, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    bltu a0, a1, .LBB46_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:  .LBB46_2:
-; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 1
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v24, v16, v0
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vmul.vv v8, v8, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2102,85 +2053,55 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ;
 ; RV64-LABEL: vp_ctpop_nxv16i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a2, a1, 3
-; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v7, v0, a2
-; RV64-NEXT:    mv a2, a0
-; RV64-NEXT:    bltu a0, a1, .LBB46_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:  .LBB46_2:
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addi a2, a2, 1365
-; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a4, a4, -241
-; RV64-NEXT:    addi a5, a5, 257
-; RV64-NEXT:    slli a6, a2, 32
-; RV64-NEXT:    add a2, a2, a6
-; RV64-NEXT:    slli a6, a3, 32
-; RV64-NEXT:    add a3, a3, a6
-; RV64-NEXT:    slli a6, a4, 32
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a1
-; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vmul.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v16, a6, v0.t
+; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 56
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vsub.vv v8, v8, v24
+; RV64-NEXT:    vsrl.vi v24, v16, 1
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vsub.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vand.vx v24, v16, a1
+; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v16, v16, a3
+; RV64-NEXT:    vsrl.vx v8, v8, a4
+; RV64-NEXT:    vsrl.vx v16, v16, a4
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    sub a3, a0, a1
-; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
-; CHECK-ZVBB-NEXT:    addi a2, a2, -1
-; CHECK-ZVBB-NEXT:    and a2, a2, a3
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v16, v16, v0.t
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB46_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB46_2:
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
+; CHECK-ZVBB-NEXT:    vcpop.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.ctpop.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i64> %v
@@ -2191,103 +2112,82 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    sub a4, a0, a1
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    sltu a2, a0, a4
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a2, a2, a4
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v0, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vsub.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v24, v0
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vadd.vv v24, v16, v24
-; RV32-NEXT:    vsrl.vi v16, v24, 4
-; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v16, v16, a2
-; RV32-NEXT:    bltu a0, a1, .LBB47_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:  .LBB47_2:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vsub.vv v24, v8, v24
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v0
-; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 1
 ; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v24, v16, v0
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vx v8, v8, a2
+; RV32-NEXT:    vmul.vv v16, v16, v24
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2296,86 +2196,55 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ;
 ; RV64-LABEL: vp_ctpop_nxv16i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB47_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a1, a2
-; RV64-NEXT:  .LBB47_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a3, 349525
-; RV64-NEXT:    lui a4, 209715
-; RV64-NEXT:    lui a5, 61681
-; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addi a3, a3, 1365
-; RV64-NEXT:    addi a4, a4, 819
-; RV64-NEXT:    addi a5, a5, -241
-; RV64-NEXT:    addi a6, a6, 257
-; RV64-NEXT:    slli a7, a3, 32
-; RV64-NEXT:    add a3, a3, a7
-; RV64-NEXT:    slli a7, a4, 32
-; RV64-NEXT:    add a4, a4, a7
-; RV64-NEXT:    slli a7, a5, 32
-; RV64-NEXT:    add a5, a5, a7
-; RV64-NEXT:    slli a7, a6, 32
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    li a7, 56
-; RV64-NEXT:    sub a2, a0, a2
-; RV64-NEXT:    sltu a0, a0, a2
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 56
+; RV64-NEXT:    vand.vx v24, v24, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vand.vx v24, v24, a0
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vand.vx v24, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a4
+; RV64-NEXT:    vand.vx v24, v16, a1
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a6
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v16, v16, a6
-; RV64-NEXT:    vsrl.vx v16, v16, a7
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v16, v16, a3
+; RV64-NEXT:    vsrl.vx v8, v8, a4
+; RV64-NEXT:    vsrl.vx v16, v16, a4
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vcpop.v v16, v16
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB47_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB47_2:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
+; CHECK-ZVBB-NEXT:    vcpop.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.ctpop.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x i64> %v
@@ -2386,36 +2255,36 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 define <vscale x 1 x i9> @vp_ctpop_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_nxv1i9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    li a0, 511
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 511
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1
-; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    li a0, 511
+; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVBB-NEXT:    vcpop.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i9> @llvm.vp.ctpop.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i9> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index c82ad17545a6a..9b2f220dd802e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -11,28 +11,25 @@
 define <vscale x 1 x i8> @vp_cttz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv1i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.cttz.nxv1i8(<vscale x 1 x i8> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i8> %v
@@ -41,27 +38,24 @@ define <vscale x 1 x i8> @vp_cttz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1
 define <vscale x 1 x i8> @vp_cttz_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv1i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv1i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.cttz.nxv1i8(<vscale x 1 x i8> %va, i1 false, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -71,28 +65,25 @@ define <vscale x 1 x i8> @vp_cttz_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zer
 define <vscale x 2 x i8> @vp_cttz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv2i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.cttz.nxv2i8(<vscale x 2 x i8> %va, i1 false, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i8> %v
@@ -101,27 +92,24 @@ define <vscale x 2 x i8> @vp_cttz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1
 define <vscale x 2 x i8> @vp_cttz_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv2i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.cttz.nxv2i8(<vscale x 2 x i8> %va, i1 false, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -131,28 +119,25 @@ define <vscale x 2 x i8> @vp_cttz_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zer
 define <vscale x 4 x i8> @vp_cttz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v12
+; CHECK-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv4i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.cttz.nxv4i8(<vscale x 4 x i8> %va, i1 false, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i8> %v
@@ -161,27 +146,24 @@ define <vscale x 4 x i8> @vp_cttz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1
 define <vscale x 4 x i8> @vp_cttz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v12
+; CHECK-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv4i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.cttz.nxv4i8(<vscale x 4 x i8> %va, i1 false, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -191,28 +173,25 @@ define <vscale x 4 x i8> @vp_cttz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zer
 define <vscale x 8 x i8> @vp_cttz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-NEXT:    vnsrl.wi v10, v12, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v10, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.cttz.nxv8i8(<vscale x 8 x i8> %va, i1 false, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i8> %v
@@ -221,27 +200,24 @@ define <vscale x 8 x i8> @vp_cttz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1
 define <vscale x 8 x i8> @vp_cttz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-NEXT:    vnsrl.wi v10, v12, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v10, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.cttz.nxv8i8(<vscale x 8 x i8> %va, i1 false, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -251,28 +227,25 @@ define <vscale x 8 x i8> @vp_cttz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zer
 define <vscale x 16 x i8> @vp_cttz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-NEXT:    vnsrl.wi v12, v16, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v12, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.cttz.nxv16i8(<vscale x 16 x i8> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i8> %v
@@ -281,27 +254,24 @@ define <vscale x 16 x i8> @vp_cttz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 define <vscale x 16 x i8> @vp_cttz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-NEXT:    vnsrl.wi v12, v16, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v12, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.cttz.nxv16i8(<vscale x 16 x i8> %va, i1 false, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -311,28 +281,28 @@ define <vscale x 16 x i8> @vp_cttz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 define <vscale x 32 x i8> @vp_cttz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vadd.vi v12, v8, -1
+; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v12, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv32i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.cttz.nxv32i8(<vscale x 32 x i8> %va, i1 false, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i8> %v
@@ -341,7 +311,7 @@ define <vscale x 32 x i8> @vp_cttz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 define <vscale x 32 x i8> @vp_cttz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv32i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vadd.vi v12, v8, -1
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    li a0, 85
@@ -361,7 +331,7 @@ define <vscale x 32 x i8> @vp_cttz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv32i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.cttz.nxv32i8(<vscale x 32 x i8> %va, i1 false, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -371,28 +341,28 @@ define <vscale x 32 x i8> @vp_cttz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 define <vscale x 64 x i8> @vp_cttz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv64i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vadd.vi v16, v8, -1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vadd.vi v16, v8, -1
+; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv64i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.cttz.nxv64i8(<vscale x 64 x i8> %va, i1 false, <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i8> %v
@@ -401,7 +371,7 @@ define <vscale x 64 x i8> @vp_cttz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 define <vscale x 64 x i8> @vp_cttz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv64i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v8, -1
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    li a0, 85
@@ -421,7 +391,7 @@ define <vscale x 64 x i8> @vp_cttz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv64i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.cttz.nxv64i8(<vscale x 64 x i8> %va, i1 false, <vscale x 64 x i1> splat (i1 true), i32 %evl)
@@ -431,35 +401,22 @@ define <vscale x 64 x i8> @vp_cttz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 define <vscale x 1 x i16> @vp_cttz_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv1i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.cttz.nxv1i16(<vscale x 1 x i16> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i16> %v
@@ -468,34 +425,21 @@ define <vscale x 1 x i16> @vp_cttz_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x
 define <vscale x 1 x i16> @vp_cttz_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv1i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv1i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.cttz.nxv1i16(<vscale x 1 x i16> %va, i1 false, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -505,35 +449,22 @@ define <vscale x 1 x i16> @vp_cttz_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 define <vscale x 2 x i16> @vp_cttz_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv2i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.cttz.nxv2i16(<vscale x 2 x i16> %va, i1 false, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i16> %v
@@ -542,34 +473,21 @@ define <vscale x 2 x i16> @vp_cttz_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x
 define <vscale x 2 x i16> @vp_cttz_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv2i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.cttz.nxv2i16(<vscale x 2 x i16> %va, i1 false, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -579,35 +497,22 @@ define <vscale x 2 x i16> @vp_cttz_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 define <vscale x 4 x i16> @vp_cttz_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv4i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.cttz.nxv4i16(<vscale x 4 x i16> %va, i1 false, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i16> %v
@@ -616,34 +521,21 @@ define <vscale x 4 x i16> @vp_cttz_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x
 define <vscale x 4 x i16> @vp_cttz_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv4i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.cttz.nxv4i16(<vscale x 4 x i16> %va, i1 false, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -653,35 +545,22 @@ define <vscale x 4 x i16> @vp_cttz_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 define <vscale x 8 x i16> @vp_cttz_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v12, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.cttz.nxv8i16(<vscale x 8 x i16> %va, i1 false, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i16> %v
@@ -690,34 +569,21 @@ define <vscale x 8 x i16> @vp_cttz_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x
 define <vscale x 8 x i16> @vp_cttz_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v12, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.cttz.nxv8i16(<vscale x 8 x i16> %va, i1 false, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -727,35 +593,22 @@ define <vscale x 8 x i16> @vp_cttz_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 define <vscale x 16 x i16> @vp_cttz_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v16, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v16, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.cttz.nxv16i16(<vscale x 16 x i16> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i16> %v
@@ -764,34 +617,21 @@ define <vscale x 16 x i16> @vp_cttz_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 define <vscale x 16 x i16> @vp_cttz_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v16, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v16, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.cttz.nxv16i16(<vscale x 16 x i16> %va, i1 false, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -801,35 +641,35 @@ define <vscale x 16 x i16> @vp_cttz_nxv16i16_unmasked(<vscale x 16 x i16> %va, i
 define <vscale x 32 x i16> @vp_cttz_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv32i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vadd.vi v16, v8, -1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vadd.vi v16, v8, -1
+; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv32i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.cttz.nxv32i16(<vscale x 32 x i16> %va, i1 false, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
@@ -838,7 +678,7 @@ define <vscale x 32 x i16> @vp_cttz_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 define <vscale x 32 x i16> @vp_cttz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv32i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v8, -1
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    lui a0, 5
@@ -865,7 +705,7 @@ define <vscale x 32 x i16> @vp_cttz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv32i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.cttz.nxv32i16(<vscale x 32 x i16> %va, i1 false, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -875,36 +715,23 @@ define <vscale x 32 x i16> @vp_cttz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 define <vscale x 1 x i32> @vp_cttz_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-NEXT:    vnsrl.wx v9, v10, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv1i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.cttz.nxv1i32(<vscale x 1 x i32> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i32> %v
@@ -913,35 +740,22 @@ define <vscale x 1 x i32> @vp_cttz_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x
 define <vscale x 1 x i32> @vp_cttz_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv1i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-NEXT:    vnsrl.wx v9, v10, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv1i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.cttz.nxv1i32(<vscale x 1 x i32> %va, i1 false, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -951,36 +765,23 @@ define <vscale x 1 x i32> @vp_cttz_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 define <vscale x 2 x i32> @vp_cttz_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-NEXT:    vnsrl.wx v9, v10, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv2i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.cttz.nxv2i32(<vscale x 2 x i32> %va, i1 false, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i32> %v
@@ -989,35 +790,22 @@ define <vscale x 2 x i32> @vp_cttz_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x
 define <vscale x 2 x i32> @vp_cttz_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-NEXT:    vnsrl.wx v9, v10, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv2i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.cttz.nxv2i32(<vscale x 2 x i32> %va, i1 false, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -1027,36 +815,23 @@ define <vscale x 2 x i32> @vp_cttz_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 define <vscale x 4 x i32> @vp_cttz_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-NEXT:    vnsrl.wx v10, v12, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32(<vscale x 4 x i32> %va, i1 false, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i32> %v
@@ -1065,35 +840,22 @@ define <vscale x 4 x i32> @vp_cttz_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x
 define <vscale x 4 x i32> @vp_cttz_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-NEXT:    vnsrl.wx v10, v12, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv4i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32(<vscale x 4 x i32> %va, i1 false, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -1103,36 +865,23 @@ define <vscale x 4 x i32> @vp_cttz_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 define <vscale x 8 x i32> @vp_cttz_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v12, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-NEXT:    vnsrl.wx v12, v16, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v12, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.cttz.nxv8i32(<vscale x 8 x i32> %va, i1 false, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i32> %v
@@ -1141,35 +890,22 @@ define <vscale x 8 x i32> @vp_cttz_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x
 define <vscale x 8 x i32> @vp_cttz_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v12, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-NEXT:    vnsrl.wx v12, v16, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v12, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.cttz.nxv8i32(<vscale x 8 x i32> %va, i1 false, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -1179,36 +915,24 @@ define <vscale x 8 x i32> @vp_cttz_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 define <vscale x 16 x i32> @vp_cttz_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vadd.vi v16, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 127
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i32> %v
@@ -1217,35 +941,23 @@ define <vscale x 16 x i32> @vp_cttz_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 define <vscale x 16 x i32> @vp_cttz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vadd.vi v16, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v16
-; CHECK-NEXT:    vand.vx v16, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v16, v8
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 127
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 false, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -1253,175 +965,53 @@ define <vscale x 16 x i32> @vp_cttz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i
 }
 
 define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv1i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vi v9, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv1i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vadd.vi v9, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v9, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfcvt.f.xu.v v9, v9
+; CHECK-NEXT:    vsrl.vx v9, v9, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv1i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.cttz.nxv1i64(<vscale x 1 x i64> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i64> %v
 }
 
 define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv1i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vi v9, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    vand.vv v9, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vmul.vv v8, v8, v9
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv1i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vadd.vi v9, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a1
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv1i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfcvt.f.xu.v v9, v9
+; CHECK-NEXT:    vsrl.vx v9, v9, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv1i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.cttz.nxv1i64(<vscale x 1 x i64> %va, i1 false, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -1429,175 +1019,53 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 }
 
 define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vi v10, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vadd.vi v10, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vfcvt.f.xu.v v10, v10
+; CHECK-NEXT:    vsrl.vx v10, v10, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv2i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.cttz.nxv2i64(<vscale x 2 x i64> %va, i1 false, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i64> %v
 }
 
 define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv2i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vi v10, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv2i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vadd.vi v10, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a1
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vfcvt.f.xu.v v10, v10
+; CHECK-NEXT:    vsrl.vx v10, v10, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv2i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.cttz.nxv2i64(<vscale x 2 x i64> %va, i1 false, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -1605,175 +1073,53 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 }
 
 define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vi v12, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v12, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vadd.vi v12, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v12, v8, v12
+; CHECK-NEXT:    vfcvt.f.xu.v v12, v12
+; CHECK-NEXT:    vsrl.vx v12, v12, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v12, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv4i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.cttz.nxv4i64(<vscale x 4 x i64> %va, i1 false, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i64> %v
 }
 
 define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv4i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vi v12, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v12, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v12
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv4i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vadd.vi v12, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a1
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v12, v8, v12
+; CHECK-NEXT:    vfcvt.f.xu.v v12, v12
+; CHECK-NEXT:    vsrl.vx v12, v12, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v12, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv4i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.cttz.nxv4i64(<vscale x 4 x i64> %va, i1 false, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -1781,175 +1127,53 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 }
 
 define <vscale x 7 x i64> @vp_cttz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv7i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv7i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv7i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v16, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv7i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.cttz.nxv7i64(<vscale x 7 x i64> %va, i1 false, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x i64> %v
 }
 
 define <vscale x 7 x i64> @vp_cttz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv7i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv7i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv7i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v16, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv7i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.cttz.nxv7i64(<vscale x 7 x i64> %va, i1 false, <vscale x 7 x i1> splat (i1 true), i32 %evl)
@@ -1957,175 +1181,53 @@ define <vscale x 7 x i64> @vp_cttz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 }
 
 define <vscale x 8 x i64> @vp_cttz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v16, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.cttz.nxv8i64(<vscale x 8 x i64> %va, i1 false, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i64> %v
 }
 
 define <vscale x 8 x i64> @vp_cttz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv8i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv8i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v16, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.cttz.nxv8i64(<vscale x 8 x i64> %va, i1 false, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -2133,431 +1235,72 @@ define <vscale x 8 x i64> @vp_cttz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 }
 
 define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv16i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; RV32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vmv1r.v v7, v0
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a2, a1, 3
-; RV32-NEXT:    sub a3, a0, a1
-; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    sltu a2, a0, a3
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a2, a2, a3
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v8, v16, -1, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v8, v0.t
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v16, a2, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    bltu a0, a1, .LBB46_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:  .LBB46_2:
-; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv16i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vmv1r.v v7, v0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    srli a6, a1, 3
-; RV64-NEXT:    sub a7, a0, a1
-; RV64-NEXT:    addi a2, a2, 1365
-; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a4, a4, -241
-; RV64-NEXT:    addi t0, a5, 257
-; RV64-NEXT:    vslidedown.vx v0, v0, a6
-; RV64-NEXT:    slli a6, a2, 32
-; RV64-NEXT:    add a6, a2, a6
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a5, a3, a5
-; RV64-NEXT:    slli a2, a4, 32
-; RV64-NEXT:    add a2, a4, a2
-; RV64-NEXT:    slli a3, t0, 32
-; RV64-NEXT:    add a3, t0, a3
-; RV64-NEXT:    sltu a4, a0, a7
-; RV64-NEXT:    addi a4, a4, -1
-; RV64-NEXT:    and a7, a4, a7
-; RV64-NEXT:    li a4, 56
-; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v16, -1, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a5, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vmul.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vsrl.vx v16, v16, a4, v0.t
-; RV64-NEXT:    bltu a0, a1, .LBB46_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB46_2:
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a4, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v24, v8, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v24
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
+; CHECK-NEXT:    vand.vv v24, v16, v24
+; CHECK-NEXT:    vmseq.vi v7, v16, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1023
+; CHECK-NEXT:    li a3, 64
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v24
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vsub.vx v8, v8, a2
+; CHECK-NEXT:    vsub.vx v16, v16, a2
+; CHECK-NEXT:    vmerge.vxm v8, v8, a3, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vxm v16, v16, a3, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    sub a3, a0, a1
-; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
-; CHECK-ZVBB-NEXT:    addi a2, a2, -1
-; CHECK-ZVBB-NEXT:    and a2, a2, a3
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v16, v16, v0.t
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB46_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB46_2:
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
+; CHECK-ZVBB-NEXT:    vctz.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.cttz.nxv16i64(<vscale x 16 x i64> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i64> %v
 }
 
 define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_nxv16i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    sub a4, a0, a1
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    sltu a2, a0, a4
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a2, a2, a4
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v16, -1
-; RV32-NEXT:    vnot.v v16, v16
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsrl.vi v16, v24, 1
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v0, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vsub.vv v24, v24, v16
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v24, v0
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vadd.vv v24, v16, v24
-; RV32-NEXT:    vsrl.vi v16, v24, 4
-; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v16, v16, a2
-; RV32-NEXT:    bltu a0, a1, .LBB47_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:  .LBB47_2:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vx v8, v8, a2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_nxv16i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    sub a6, a0, a1
-; RV64-NEXT:    addi a2, a2, 1365
-; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a7, a4, -241
-; RV64-NEXT:    addi t0, a5, 257
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a5, a2, a5
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a4, a3, a4
-; RV64-NEXT:    slli a2, a7, 32
-; RV64-NEXT:    add a2, a7, a2
-; RV64-NEXT:    slli a3, t0, 32
-; RV64-NEXT:    add a3, t0, a3
-; RV64-NEXT:    sltu a7, a0, a6
-; RV64-NEXT:    addi a7, a7, -1
-; RV64-NEXT:    and a6, a7, a6
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v16, -1
-; RV64-NEXT:    vnot.v v16, v16
-; RV64-NEXT:    vand.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a5
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a4
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vmul.vx v16, v16, a3
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v16, v16, a6
-; RV64-NEXT:    bltu a0, a1, .LBB47_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB47_2:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a5
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vand.vx v24, v8, a4
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_nxv16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v24, v8, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v24
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
+; CHECK-NEXT:    vand.vv v24, v16, v24
+; CHECK-NEXT:    vmseq.vi v7, v16, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1023
+; CHECK-NEXT:    li a3, 64
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v24
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vsub.vx v8, v8, a2
+; CHECK-NEXT:    vsub.vx v16, v16, a2
+; CHECK-NEXT:    vmerge.vxm v8, v8, a3, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vxm v16, v16, a3, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v16, v16
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB47_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB47_2:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
+; CHECK-ZVBB-NEXT:    vctz.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.cttz.nxv16i64(<vscale x 16 x i64> %va, i1 false, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x i64> %v
@@ -2566,26 +1309,23 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 define <vscale x 1 x i8> @vp_cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.cttz.nxv1i8(<vscale x 1 x i8> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i8> %v
@@ -2594,7 +1334,7 @@ define <vscale x 1 x i8> @vp_cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 define <vscale x 1 x i8> @vp_cttz_zero_undef_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv1i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
@@ -2609,7 +1349,7 @@ define <vscale x 1 x i8> @vp_cttz_zero_undef_nxv1i8_unmasked(<vscale x 1 x i8> %
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i8> @llvm.vp.cttz.nxv1i8(<vscale x 1 x i8> %va, i1 true, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -2619,26 +1359,23 @@ define <vscale x 1 x i8> @vp_cttz_zero_undef_nxv1i8_unmasked(<vscale x 1 x i8> %
 define <vscale x 2 x i8> @vp_cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.cttz.nxv2i8(<vscale x 2 x i8> %va, i1 true, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i8> %v
@@ -2647,7 +1384,7 @@ define <vscale x 2 x i8> @vp_cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 define <vscale x 2 x i8> @vp_cttz_zero_undef_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
@@ -2662,7 +1399,7 @@ define <vscale x 2 x i8> @vp_cttz_zero_undef_nxv2i8_unmasked(<vscale x 2 x i8> %
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i8> @llvm.vp.cttz.nxv2i8(<vscale x 2 x i8> %va, i1 true, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -2672,26 +1409,23 @@ define <vscale x 2 x i8> @vp_cttz_zero_undef_nxv2i8_unmasked(<vscale x 2 x i8> %
 define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v10, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.cttz.nxv4i8(<vscale x 4 x i8> %va, i1 true, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i8> %v
@@ -2700,7 +1434,7 @@ define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -2715,7 +1449,7 @@ define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8_unmasked(<vscale x 4 x i8> %
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vp.cttz.nxv4i8(<vscale x 4 x i8> %va, i1 true, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -2725,26 +1459,23 @@ define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8_unmasked(<vscale x 4 x i8> %
 define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v12, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.cttz.nxv8i8(<vscale x 8 x i8> %va, i1 true, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i8> %v
@@ -2753,7 +1484,7 @@ define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -2768,7 +1499,7 @@ define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8_unmasked(<vscale x 8 x i8> %
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i8> @llvm.vp.cttz.nxv8i8(<vscale x 8 x i8> %va, i1 true, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -2778,26 +1509,23 @@ define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8_unmasked(<vscale x 8 x i8> %
 define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT:    vzext.vf2 v16, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.cttz.nxv16i8(<vscale x 16 x i8> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i8> %v
@@ -2806,7 +1534,7 @@ define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -2821,7 +1549,7 @@ define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8_unmasked(<vscale x 16 x i8
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vp.cttz.nxv16i8(<vscale x 16 x i8> %va, i1 true, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -2831,28 +1559,28 @@ define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8_unmasked(<vscale x 16 x i8
 define <vscale x 32 x i8> @vp_cttz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vadd.vi v12, v8, -1
+; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v12, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv32i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.cttz.nxv32i8(<vscale x 32 x i8> %va, i1 true, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i8> %v
@@ -2861,7 +1589,7 @@ define <vscale x 32 x i8> @vp_cttz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <v
 define <vscale x 32 x i8> @vp_cttz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv32i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vadd.vi v12, v8, -1
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    li a0, 85
@@ -2881,7 +1609,7 @@ define <vscale x 32 x i8> @vp_cttz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv32i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i8> @llvm.vp.cttz.nxv32i8(<vscale x 32 x i8> %va, i1 true, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -2891,28 +1619,28 @@ define <vscale x 32 x i8> @vp_cttz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 define <vscale x 64 x i8> @vp_cttz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv64i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vadd.vi v16, v8, -1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vadd.vi v16, v8, -1
+; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv64i8:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.cttz.nxv64i8(<vscale x 64 x i8> %va, i1 true, <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i8> %v
@@ -2921,7 +1649,7 @@ define <vscale x 64 x i8> @vp_cttz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <v
 define <vscale x 64 x i8> @vp_cttz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv64i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v8, -1
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    li a0, 85
@@ -2941,7 +1669,7 @@ define <vscale x 64 x i8> @vp_cttz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv64i8_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 64 x i8> @llvm.vp.cttz.nxv64i8(<vscale x 64 x i8> %va, i1 true, <vscale x 64 x i1> splat (i1 true), i32 %evl)
@@ -2951,22 +1679,19 @@ define <vscale x 64 x i8> @vp_cttz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 define <vscale x 1 x i16> @vp_cttz_zero_undef_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.cttz.nxv1i16(<vscale x 1 x i16> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i16> %v
@@ -2975,7 +1700,7 @@ define <vscale x 1 x i16> @vp_cttz_zero_undef_nxv1i16(<vscale x 1 x i16> %va, <v
 define <vscale x 1 x i16> @vp_cttz_zero_undef_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv1i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
@@ -2986,7 +1711,7 @@ define <vscale x 1 x i16> @vp_cttz_zero_undef_nxv1i16_unmasked(<vscale x 1 x i16
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.cttz.nxv1i16(<vscale x 1 x i16> %va, i1 true, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -2996,22 +1721,19 @@ define <vscale x 1 x i16> @vp_cttz_zero_undef_nxv1i16_unmasked(<vscale x 1 x i16
 define <vscale x 2 x i16> @vp_cttz_zero_undef_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.cttz.nxv2i16(<vscale x 2 x i16> %va, i1 true, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i16> %v
@@ -3020,7 +1742,7 @@ define <vscale x 2 x i16> @vp_cttz_zero_undef_nxv2i16(<vscale x 2 x i16> %va, <v
 define <vscale x 2 x i16> @vp_cttz_zero_undef_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
@@ -3031,7 +1753,7 @@ define <vscale x 2 x i16> @vp_cttz_zero_undef_nxv2i16_unmasked(<vscale x 2 x i16
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.cttz.nxv2i16(<vscale x 2 x i16> %va, i1 true, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -3041,22 +1763,19 @@ define <vscale x 2 x i16> @vp_cttz_zero_undef_nxv2i16_unmasked(<vscale x 2 x i16
 define <vscale x 4 x i16> @vp_cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.cttz.nxv4i16(<vscale x 4 x i16> %va, i1 true, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i16> %v
@@ -3065,7 +1784,7 @@ define <vscale x 4 x i16> @vp_cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va, <v
 define <vscale x 4 x i16> @vp_cttz_zero_undef_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v10, v8, v9
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
@@ -3076,7 +1795,7 @@ define <vscale x 4 x i16> @vp_cttz_zero_undef_nxv4i16_unmasked(<vscale x 4 x i16
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.cttz.nxv4i16(<vscale x 4 x i16> %va, i1 true, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -3086,22 +1805,19 @@ define <vscale x 4 x i16> @vp_cttz_zero_undef_nxv4i16_unmasked(<vscale x 4 x i16
 define <vscale x 8 x i16> @vp_cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.cttz.nxv8i16(<vscale x 8 x i16> %va, i1 true, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i16> %v
@@ -3110,7 +1826,7 @@ define <vscale x 8 x i16> @vp_cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va, <v
 define <vscale x 8 x i16> @vp_cttz_zero_undef_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-NEXT:    vand.vv v12, v8, v10
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
@@ -3121,7 +1837,7 @@ define <vscale x 8 x i16> @vp_cttz_zero_undef_nxv8i16_unmasked(<vscale x 8 x i16
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.cttz.nxv8i16(<vscale x 8 x i16> %va, i1 true, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -3131,22 +1847,19 @@ define <vscale x 8 x i16> @vp_cttz_zero_undef_nxv8i16_unmasked(<vscale x 8 x i16
 define <vscale x 16 x i16> @vp_cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    vand.vv v16, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wi v16, v8, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.cttz.nxv16i16(<vscale x 16 x i16> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i16> %v
@@ -3155,7 +1868,7 @@ define <vscale x 16 x i16> @vp_cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va,
 define <vscale x 16 x i16> @vp_cttz_zero_undef_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-NEXT:    vand.vv v16, v8, v12
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
@@ -3166,7 +1879,7 @@ define <vscale x 16 x i16> @vp_cttz_zero_undef_nxv16i16_unmasked(<vscale x 16 x
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.cttz.nxv16i16(<vscale x 16 x i16> %va, i1 true, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -3176,35 +1889,35 @@ define <vscale x 16 x i16> @vp_cttz_zero_undef_nxv16i16_unmasked(<vscale x 16 x
 define <vscale x 32 x i16> @vp_cttz_zero_undef_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv32i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vadd.vi v16, v8, -1, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vadd.vi v16, v8, -1
+; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v16, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv32i16:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.cttz.nxv32i16(<vscale x 32 x i16> %va, i1 true, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
@@ -3213,7 +1926,7 @@ define <vscale x 32 x i16> @vp_cttz_zero_undef_nxv32i16(<vscale x 32 x i16> %va,
 define <vscale x 32 x i16> @vp_cttz_zero_undef_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv32i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v8, -1
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    lui a0, 5
@@ -3240,7 +1953,7 @@ define <vscale x 32 x i16> @vp_cttz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv32i16_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.cttz.nxv32i16(<vscale x 32 x i16> %va, i1 true, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -3250,23 +1963,20 @@ define <vscale x 32 x i16> @vp_cttz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 define <vscale x 1 x i32> @vp_cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wx v8, v9, a0
 ; CHECK-NEXT:    li a0, 1023
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.cttz.nxv1i32(<vscale x 1 x i32> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i32> %v
@@ -3275,7 +1985,7 @@ define <vscale x 1 x i32> @vp_cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va, <v
 define <vscale x 1 x i32> @vp_cttz_zero_undef_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv1i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9
@@ -3287,7 +1997,7 @@ define <vscale x 1 x i32> @vp_cttz_zero_undef_nxv1i32_unmasked(<vscale x 1 x i32
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.cttz.nxv1i32(<vscale x 1 x i32> %va, i1 true, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -3297,23 +2007,20 @@ define <vscale x 1 x i32> @vp_cttz_zero_undef_nxv1i32_unmasked(<vscale x 1 x i32
 define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wx v10, v8, a0
 ; CHECK-NEXT:    li a0, 1023
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.cttz.nxv2i32(<vscale x 2 x i32> %va, i1 true, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i32> %v
@@ -3322,7 +2029,7 @@ define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va, <v
 define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v10, v8, v9
@@ -3334,7 +2041,7 @@ define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32_unmasked(<vscale x 2 x i32
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.cttz.nxv2i32(<vscale x 2 x i32> %va, i1 true, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -3344,23 +2051,20 @@ define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32_unmasked(<vscale x 2 x i32
 define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v12, v8, v10, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wx v12, v8, a0
 ; CHECK-NEXT:    li a0, 1023
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32(<vscale x 4 x i32> %va, i1 true, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i32> %v
@@ -3369,7 +2073,7 @@ define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va, <v
 define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v12, v8, v10
@@ -3381,7 +2085,7 @@ define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32_unmasked(<vscale x 4 x i32
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32(<vscale x 4 x i32> %va, i1 true, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -3391,23 +2095,20 @@ define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32_unmasked(<vscale x 4 x i32
 define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-NEXT:    li a0, 52
-; CHECK-NEXT:    vand.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vand.vv v16, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wx v16, v8, a0
 ; CHECK-NEXT:    li a0, 1023
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.cttz.nxv8i32(<vscale x 8 x i32> %va, i1 true, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i32> %v
@@ -3416,7 +2117,7 @@ define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va, <v
 define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v16, v8, v12
@@ -3428,7 +2129,7 @@ define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.cttz.nxv8i32(<vscale x 8 x i32> %va, i1 true, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -3438,21 +2139,21 @@ define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32
 define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i32:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i32> %v
@@ -3461,7 +2162,7 @@ define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va,
 define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vand.vv v8, v8, v16
@@ -3474,7 +2175,7 @@ define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32_unmasked(<vscale x 16 x
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i32_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 true, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -3484,22 +2185,22 @@ define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32_unmasked(<vscale x 16 x
 define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv1i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.cttz.nxv1i64(<vscale x 1 x i64> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i64> %v
@@ -3508,7 +2209,7 @@ define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <v
 define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv1i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
@@ -3522,7 +2223,7 @@ define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.cttz.nxv1i64(<vscale x 1 x i64> %va, i1 true, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -3532,22 +2233,22 @@ define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64
 define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.cttz.nxv2i64(<vscale x 2 x i64> %va, i1 true, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i64> %v
@@ -3556,7 +2257,7 @@ define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <v
 define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
@@ -3570,7 +2271,7 @@ define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.cttz.nxv2i64(<vscale x 2 x i64> %va, i1 true, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -3580,22 +2281,22 @@ define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64
 define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv4i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v12
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.cttz.nxv4i64(<vscale x 4 x i64> %va, i1 true, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i64> %v
@@ -3604,7 +2305,7 @@ define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <v
 define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv4i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
@@ -3618,7 +2319,7 @@ define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.cttz.nxv4i64(<vscale x 4 x i64> %va, i1 true, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -3628,22 +2329,22 @@ define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64
 define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv7i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv7i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.cttz.nxv7i64(<vscale x 7 x i64> %va, i1 true, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x i64> %v
@@ -3652,7 +2353,7 @@ define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <v
 define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv7i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
@@ -3666,7 +2367,7 @@ define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv7i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.cttz.nxv7i64(<vscale x 7 x i64> %va, i1 true, <vscale x 7 x i1> splat (i1 true), i32 %evl)
@@ -3676,22 +2377,22 @@ define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64
 define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv8i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.cttz.nxv8i64(<vscale x 8 x i64> %va, i1 true, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i64> %v
@@ -3700,7 +2401,7 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <v
 define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv8i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    li a1, 52
@@ -3714,7 +2415,7 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.cttz.nxv8i64(<vscale x 8 x i64> %va, i1 true, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -3724,60 +2425,28 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    sub a4, a0, a1
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a4
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a4, a2, a4
-; CHECK-NEXT:    li a2, 52
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v24, v16, 0, v0.t
-; CHECK-NEXT:    vand.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
-; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    li a3, 1023
-; CHECK-NEXT:    vsub.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB94_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB94_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v24, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v24, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vand.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1023
+; CHECK-NEXT:    vand.vv v8, v8, v24
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
+; CHECK-NEXT:    vand.vv v16, v16, v24
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vsub.vx v8, v8, a2
+; CHECK-NEXT:    vsub.vx v16, v16, a2
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i64:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    sub a3, a0, a1
-; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
-; CHECK-ZVBB-NEXT:    addi a2, a2, -1
-; CHECK-ZVBB-NEXT:    and a2, a2, a3
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v16, v16, v0.t
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB94_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB94_2:
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
+; CHECK-ZVBB-NEXT:    vctz.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.cttz.nxv16i64(<vscale x 16 x i64> %va, i1 true, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i64> %v
@@ -3786,50 +2455,28 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a2
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a4, a4, a2
-; CHECK-NEXT:    li a2, 52
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v24, v16, 0
-; CHECK-NEXT:    vand.vv v16, v16, v24
-; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    vsrl.vx v16, v16, a2
-; CHECK-NEXT:    li a3, 1023
-; CHECK-NEXT:    vsub.vx v16, v16, a3
-; CHECK-NEXT:    bltu a0, a1, .LBB95_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB95_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v24
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1023
+; CHECK-NEXT:    vand.vv v8, v8, v24
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
+; CHECK-NEXT:    vand.vv v16, v16, v24
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vx v8, v8, a2
-; CHECK-NEXT:    vsub.vx v8, v8, a3
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vsub.vx v8, v8, a2
+; CHECK-NEXT:    vsub.vx v16, v16, a2
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v16, v16
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB95_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB95_2:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v8, v8
+; CHECK-ZVBB-NEXT:    vctz.v v16, v16
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.cttz.nxv16i64(<vscale x 16 x i64> %va, i1 true, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x i64> %v
@@ -3839,26 +2486,23 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 define <vscale x 1 x i9> @vp_cttz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_nxv1i9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 512
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vor.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 512
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vor.vx v8, v8, a0
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 512
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vor.vx v8, v8, a1, v0.t
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    li a0, 512
+; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vor.vx v8, v8, a0
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i9> @llvm.vp.cttz.nxv1i9(<vscale x 1 x i9> %va, i1 false, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i9> %v
@@ -3866,23 +2510,23 @@ define <vscale x 1 x i9> @vp_cttz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 define <vscale x 1 x i9> @vp_zero_undef_cttz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_zero_undef_cttz_nxv1i9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_zero_undef_cttz_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vctz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vctz.v v8, v8
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i9> @llvm.vp.cttz.nxv1i9(<vscale x 1 x i9> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i9> %v
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 00c36cb7f7327..31322e4bc9201 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -7,26 +7,16 @@
 define <2 x i8> @vp_ctlz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8> %va, i1 false, <2 x i1> %m, i32 %evl)
   ret <2 x i8> %v
@@ -35,26 +25,16 @@ define <2 x i8> @vp_ctlz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i8> @vp_ctlz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8> %va, i1 false, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i8> %v
@@ -63,26 +43,16 @@ define <2 x i8> @vp_ctlz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 define <4 x i8> @vp_ctlz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8> %va, i1 false, <4 x i1> %m, i32 %evl)
   ret <4 x i8> %v
@@ -91,26 +61,16 @@ define <4 x i8> @vp_ctlz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i8> @vp_ctlz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8> %va, i1 false, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i8> %v
@@ -119,26 +79,16 @@ define <4 x i8> @vp_ctlz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 define <8 x i8> @vp_ctlz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8> %va, i1 false, <8 x i1> %m, i32 %evl)
   ret <8 x i8> %v
@@ -147,26 +97,16 @@ define <8 x i8> @vp_ctlz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i8> @vp_ctlz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8> %va, i1 false, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i8> %v
@@ -175,26 +115,16 @@ define <8 x i8> @vp_ctlz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 define <16 x i8> @vp_ctlz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i8> %v
@@ -203,26 +133,16 @@ define <16 x i8> @vp_ctlz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 define <16 x i8> @vp_ctlz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8> %va, i1 false, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i8> %v
@@ -231,35 +151,13 @@ define <16 x i8> @vp_ctlz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 define <2 x i16> @vp_ctlz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> %va, i1 false, <2 x i1> %m, i32 %evl)
   ret <2 x i16> %v
@@ -268,35 +166,13 @@ define <2 x i16> @vp_ctlz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i16> @vp_ctlz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> %va, i1 false, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i16> %v
@@ -305,35 +181,13 @@ define <2 x i16> @vp_ctlz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 define <4 x i16> @vp_ctlz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> %va, i1 false, <4 x i1> %m, i32 %evl)
   ret <4 x i16> %v
@@ -342,35 +196,13 @@ define <4 x i16> @vp_ctlz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i16> @vp_ctlz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> %va, i1 false, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i16> %v
@@ -379,35 +211,13 @@ define <4 x i16> @vp_ctlz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 define <8 x i16> @vp_ctlz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> %va, i1 false, <8 x i1> %m, i32 %evl)
   ret <8 x i16> %v
@@ -416,35 +226,13 @@ define <8 x i16> @vp_ctlz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i16> @vp_ctlz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> %va, i1 false, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i16> %v
@@ -453,35 +241,13 @@ define <8 x i16> @vp_ctlz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 define <16 x i16> @vp_ctlz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i16> %v
@@ -490,35 +256,13 @@ define <16 x i16> @vp_ctlz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl
 define <16 x i16> @vp_ctlz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> %va, i1 false, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i16> %v
@@ -527,38 +271,14 @@ define <16 x i16> @vp_ctlz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 define <2 x i32> @vp_ctlz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 16, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v9, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32> %va, i1 false, <2 x i1> %m, i32 %evl)
   ret <2 x i32> %v
@@ -567,38 +287,14 @@ define <2 x i32> @vp_ctlz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i32> @vp_ctlz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 16
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v9, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32> %va, i1 false, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i32> %v
@@ -607,38 +303,14 @@ define <2 x i32> @vp_ctlz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 16, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v10, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %va, i1 false, <4 x i1> %m, i32 %evl)
   ret <4 x i32> %v
@@ -647,38 +319,14 @@ define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i32> @vp_ctlz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 16
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v10, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %va, i1 false, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i32> %v
@@ -687,38 +335,14 @@ define <4 x i32> @vp_ctlz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 define <8 x i32> @vp_ctlz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 16, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v12, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32> %va, i1 false, <8 x i1> %m, i32 %evl)
   ret <8 x i32> %v
@@ -727,38 +351,14 @@ define <8 x i32> @vp_ctlz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i32> @vp_ctlz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 16
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v12, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32> %va, i1 false, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i32> %v
@@ -767,38 +367,14 @@ define <8 x i32> @vp_ctlz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 define <16 x i32> @vp_ctlz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v16, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i32> %v
@@ -807,1574 +383,239 @@ define <16 x i32> @vp_ctlz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl
 define <16 x i32> @vp_ctlz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_v16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 16
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v16, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32> %va, i1 false, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i32> %v
 }
 
 define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vx v9, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl)
   ret <2 x i64> %v
 }
 
 define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v2i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    vand.vv v9, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vmul.vv v8, v8, v9
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v2i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vx v9, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 false, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i64> %v
 }
 
 define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v10, v12, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vx v10, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
 }
 
 define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v4i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v4i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vx v10, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 false, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
 }
 
 define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v12, v16, v12, v0.t
-; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vx v12, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl)
   ret <8 x i64> %v
 }
 
 define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v8i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vx v12, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v12, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v12
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v8i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vx v12, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 false, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i64> %v
 }
 
 define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v15i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v15i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v15i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
 }
 
 define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v15i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v15i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vx v16, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v15i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> splat (i1 true), i32 %evl)
   ret <15 x i64> %v
 }
 
 define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v16i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v16i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
 }
 
 define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v16i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v16i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vx v16, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 false, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i64> %v
 }
 
 define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v32i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 2
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a1, .LBB34_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:  .LBB34_2:
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v32i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v7, v0, 2
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB34_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB34_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addi a2, a2, 1365
-; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a6, a4, -241
-; RV64-NEXT:    addi a7, a5, 257
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a5, a2, a5
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a4, a3, a4
-; RV64-NEXT:    slli a2, a6, 32
-; RV64-NEXT:    add a2, a6, a2
-; RV64-NEXT:    slli a3, a7, 32
-; RV64-NEXT:    add a3, a7, a3
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a5, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 2, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 8, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 16, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vx v24, v16, a1, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a5, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a4, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vmul.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v32i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1086
+; CHECK-NEXT:    li a3, 64
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
+; CHECK-NEXT:    vrsub.vx v16, v16, a2
+; CHECK-NEXT:    vminu.vx v8, v8, a3
+; CHECK-NEXT:    vminu.vx v16, v16, a3
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
 }
 
 define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_v32i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB35_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:  .LBB35_2:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsrl.vi v0, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 1
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 2
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 8
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 16
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v0, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
-; RV32-NEXT:    vor.vv v24, v16, v8
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v16
-; RV32-NEXT:    vsub.vv v24, v24, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v8, v24, v8
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v24, a2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v32i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB35_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB35_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    li a2, 32
-; RV64-NEXT:    lui a3, 349525
-; RV64-NEXT:    lui a4, 209715
-; RV64-NEXT:    lui a5, 61681
-; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addi a7, a3, 1365
-; RV64-NEXT:    addi a3, a4, 819
-; RV64-NEXT:    addi a4, a5, -241
-; RV64-NEXT:    addi a6, a6, 257
-; RV64-NEXT:    slli a5, a7, 32
-; RV64-NEXT:    add a7, a7, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a5, a3, a5
-; RV64-NEXT:    slli a3, a4, 32
-; RV64-NEXT:    add a3, a4, a3
-; RV64-NEXT:    slli a4, a6, 32
-; RV64-NEXT:    add a4, a6, a4
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 16
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a7
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v16, a2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a5
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vnot.v v16, v16
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a5
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a7
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a5
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vmul.vx v16, v16, a4
-; RV64-NEXT:    vsrl.vx v16, v16, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_v32i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1086
+; CHECK-NEXT:    li a3, 64
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
+; CHECK-NEXT:    vrsub.vx v16, v16, a2
+; CHECK-NEXT:    vminu.vx v8, v8, a3
+; CHECK-NEXT:    vminu.vx v16, v16, a3
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
 }
@@ -2382,26 +623,14 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 define <2 x i8> @vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8> %va, i1 true, <2 x i1> %m, i32 %evl)
   ret <2 x i8> %v
@@ -2410,26 +639,14 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext
 define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8> %va, i1 true, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i8> %v
@@ -2438,26 +655,14 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl
 define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8> %va, i1 true, <4 x i1> %m, i32 %evl)
   ret <4 x i8> %v
@@ -2466,26 +671,14 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext
 define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8> %va, i1 true, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i8> %v
@@ -2494,26 +687,14 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl
 define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8> %va, i1 true, <8 x i1> %m, i32 %evl)
   ret <8 x i8> %v
@@ -2522,26 +703,14 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext
 define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8> %va, i1 true, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i8> %v
@@ -2550,26 +719,14 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl
 define <16 x i8> @vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i8> %v
@@ -2578,26 +735,14 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero
 define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    li a0, 134
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8> %va, i1 true, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i8> %v
@@ -2606,35 +751,11 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %
 define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> %va, i1 true, <2 x i1> %m, i32 %evl)
   ret <2 x i16> %v
@@ -2643,35 +764,11 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe
 define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> %va, i1 true, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i16> %v
@@ -2680,35 +777,11 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %
 define <4 x i16> @vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> %va, i1 true, <4 x i1> %m, i32 %evl)
   ret <4 x i16> %v
@@ -2717,35 +790,11 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe
 define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> %va, i1 true, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i16> %v
@@ -2754,35 +803,11 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %
 define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> %va, i1 true, <8 x i1> %m, i32 %evl)
   ret <8 x i16> %v
@@ -2791,35 +816,11 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe
 define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> %va, i1 true, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i16> %v
@@ -2828,35 +829,11 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %
 define <16 x i16> @vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i16> %v
@@ -2865,35 +842,11 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z
 define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> %va, i1 true, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i16> %v
@@ -2902,38 +855,12 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex
 define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 16, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v9, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32> %va, i1 true, <2 x i1> %m, i32 %evl)
   ret <2 x i32> %v
@@ -2942,38 +869,12 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe
 define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 16
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v9, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32> %va, i1 true, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i32> %v
@@ -2982,38 +883,12 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %
 define <4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 16, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v10, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %va, i1 true, <4 x i1> %m, i32 %evl)
   ret <4 x i32> %v
@@ -3022,38 +897,12 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe
 define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 16
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v10, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %va, i1 true, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i32> %v
@@ -3062,38 +911,12 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %
 define <8 x i32> @vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 16, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v12, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32> %va, i1 true, <8 x i1> %m, i32 %evl)
   ret <8 x i32> %v
@@ -3102,38 +925,12 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe
 define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 16
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v12, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32> %va, i1 true, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i32> %v
@@ -3142,38 +939,12 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %
 define <16 x i32> @vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v16, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i32> %v
@@ -3182,1574 +953,214 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z
 define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_v16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 16
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vnsrl.wx v8, v16, a0
+; CHECK-NEXT:    li a0, 1054
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32> %va, i1 true, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i32> %v
 }
 
 define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vx v9, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 true, <2 x i1> %m, i32 %evl)
   ret <2 x i64> %v
 }
 
 define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v2i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    vand.vv v9, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vmul.vv v8, v8, v9
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v2i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vx v9, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 true, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i64> %v
 }
 
 define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v10, v12, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vx v10, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 true, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
 }
 
 define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v4i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v4i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vx v10, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 true, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
 }
 
 define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v12, v16, v12, v0.t
-; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vx v12, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 true, <8 x i1> %m, i32 %evl)
   ret <8 x i64> %v
 }
 
 define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v8i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vx v12, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v12, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v12
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v8i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vx v12, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 true, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i64> %v
 }
 
 define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v15i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v15i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v15i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
 }
 
 define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vx v16, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 true, <15 x i1> splat (i1 true), i32 %evl)
   ret <15 x i64> %v
 }
 
 define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v16i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v16i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
 }
 
 define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    li a4, 32
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vx v16, v8, a4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 true, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i64> %v
 }
 
 define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v32i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 2
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a1, .LBB70_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:  .LBB70_2:
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v32i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v7, v0, 2
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB70_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB70_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addi a2, a2, 1365
-; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a6, a4, -241
-; RV64-NEXT:    addi a7, a5, 257
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a5, a2, a5
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a4, a3, a4
-; RV64-NEXT:    slli a2, a6, 32
-; RV64-NEXT:    add a2, a6, a2
-; RV64-NEXT:    slli a3, a7, 32
-; RV64-NEXT:    add a3, a7, a3
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a5, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 2, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 8, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 16, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vx v24, v16, a1, v0.t
-; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a5, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a4, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vmul.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v32i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1086
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
+; CHECK-NEXT:    vrsub.vx v16, v16, a2
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
 }
 
 define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB71_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:  .LBB71_2:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsrl.vi v0, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 1
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 2
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 8
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 16
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v0, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
-; RV32-NEXT:    vor.vv v24, v16, v8
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v16
-; RV32-NEXT:    vsub.vv v24, v24, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v8, v24, v8
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v24, a2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB71_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB71_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    li a2, 32
-; RV64-NEXT:    lui a3, 349525
-; RV64-NEXT:    lui a4, 209715
-; RV64-NEXT:    lui a5, 61681
-; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addi a7, a3, 1365
-; RV64-NEXT:    addi a3, a4, 819
-; RV64-NEXT:    addi a4, a5, -241
-; RV64-NEXT:    addi a6, a6, 257
-; RV64-NEXT:    slli a5, a7, 32
-; RV64-NEXT:    add a7, a7, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a5, a3, a5
-; RV64-NEXT:    slli a3, a4, 32
-; RV64-NEXT:    add a3, a4, a3
-; RV64-NEXT:    slli a4, a6, 32
-; RV64-NEXT:    add a4, a6, a4
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 16
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a7
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v16, a2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a5
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vnot.v v16, v16
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a5
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a7
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a5
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vmul.vx v16, v16, a4
-; RV64-NEXT:    vsrl.vx v16, v16, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1086
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
+; CHECK-NEXT:    vrsub.vx v16, v16, a2
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index f56438bf87e6a..5aa2d55e2b06c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -7,19 +7,19 @@
 define <2 x i8> @vp_ctpop_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
   %v = call <2 x i8> @llvm.vp.ctpop.v2i8(<2 x i8> %va, <2 x i1> %m, i32 %evl)
   ret <2 x i8> %v
@@ -28,7 +28,7 @@ define <2 x i8> @vp_ctpop_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i8> @vp_ctpop_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
@@ -49,19 +49,19 @@ define <2 x i8> @vp_ctpop_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 define <4 x i8> @vp_ctpop_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
   %v = call <4 x i8> @llvm.vp.ctpop.v4i8(<4 x i8> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i8> %v
@@ -70,7 +70,7 @@ define <4 x i8> @vp_ctpop_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i8> @vp_ctpop_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
@@ -91,19 +91,19 @@ define <4 x i8> @vp_ctpop_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 define <8 x i8> @vp_ctpop_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
   %v = call <8 x i8> @llvm.vp.ctpop.v8i8(<8 x i8> %va, <8 x i1> %m, i32 %evl)
   ret <8 x i8> %v
@@ -112,7 +112,7 @@ define <8 x i8> @vp_ctpop_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i8> @vp_ctpop_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
@@ -133,19 +133,19 @@ define <8 x i8> @vp_ctpop_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 define <16 x i8> @vp_ctpop_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    ret
   %v = call <16 x i8> @llvm.vp.ctpop.v16i8(<16 x i8> %va, <16 x i1> %m, i32 %evl)
   ret <16 x i8> %v
@@ -154,7 +154,7 @@ define <16 x i8> @vp_ctpop_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl)
 define <16 x i8> @vp_ctpop_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
@@ -175,26 +175,26 @@ define <16 x i8> @vp_ctpop_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 define <2 x i16> @vp_ctpop_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
   %v = call <2 x i16> @llvm.vp.ctpop.v2i16(<2 x i16> %va, <2 x i1> %m, i32 %evl)
   ret <2 x i16> %v
@@ -203,7 +203,7 @@ define <2 x i16> @vp_ctpop_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i16> @vp_ctpop_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -231,26 +231,26 @@ define <2 x i16> @vp_ctpop_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 define <4 x i16> @vp_ctpop_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
   %v = call <4 x i16> @llvm.vp.ctpop.v4i16(<4 x i16> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i16> %v
@@ -259,7 +259,7 @@ define <4 x i16> @vp_ctpop_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i16> @vp_ctpop_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -287,26 +287,26 @@ define <4 x i16> @vp_ctpop_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 define <8 x i16> @vp_ctpop_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
   %v = call <8 x i16> @llvm.vp.ctpop.v8i16(<8 x i16> %va, <8 x i1> %m, i32 %evl)
   ret <8 x i16> %v
@@ -315,7 +315,7 @@ define <8 x i16> @vp_ctpop_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i16> @vp_ctpop_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -343,26 +343,26 @@ define <8 x i16> @vp_ctpop_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 define <16 x i16> @vp_ctpop_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 8
 ; CHECK-NEXT:    ret
   %v = call <16 x i16> @llvm.vp.ctpop.v16i16(<16 x i16> %va, <16 x i1> %m, i32 %evl)
   ret <16 x i16> %v
@@ -371,7 +371,7 @@ define <16 x i16> @vp_ctpop_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %ev
 define <16 x i16> @vp_ctpop_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -399,27 +399,27 @@ define <16 x i16> @vp_ctpop_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 define <2 x i32> @vp_ctpop_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
 ; CHECK-NEXT:    ret
   %v = call <2 x i32> @llvm.vp.ctpop.v2i32(<2 x i32> %va, <2 x i1> %m, i32 %evl)
   ret <2 x i32> %v
@@ -428,7 +428,7 @@ define <2 x i32> @vp_ctpop_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i32> @vp_ctpop_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -457,27 +457,27 @@ define <2 x i32> @vp_ctpop_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 define <4 x i32> @vp_ctpop_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v9, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
 ; CHECK-NEXT:    ret
   %v = call <4 x i32> @llvm.vp.ctpop.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i32> %v
@@ -486,7 +486,7 @@ define <4 x i32> @vp_ctpop_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i32> @vp_ctpop_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -515,27 +515,27 @@ define <4 x i32> @vp_ctpop_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 define <8 x i32> @vp_ctpop_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
 ; CHECK-NEXT:    ret
   %v = call <8 x i32> @llvm.vp.ctpop.v8i32(<8 x i32> %va, <8 x i1> %m, i32 %evl)
   ret <8 x i32> %v
@@ -544,7 +544,7 @@ define <8 x i32> @vp_ctpop_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i32> @vp_ctpop_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -573,27 +573,27 @@ define <8 x i32> @vp_ctpop_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 define <16 x i32> @vp_ctpop_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v12, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vadd.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
 ; CHECK-NEXT:    ret
   %v = call <16 x i32> @llvm.vp.ctpop.v16i32(<16 x i32> %va, <16 x i1> %m, i32 %evl)
   ret <16 x i32> %v
@@ -602,7 +602,7 @@ define <16 x i32> @vp_ctpop_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %ev
 define <16 x i32> @vp_ctpop_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctpop_v16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -631,74 +631,74 @@ define <16 x i32> @vp_ctpop_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vmv.v.x v9, a0
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vmul.vv v8, v8, v9
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v9, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vsrl.vi v9, v8, 1
+; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    vsub.vv v8, v8, v9
+; RV64-NEXT:    vand.vx v9, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v9, v8
+; RV64-NEXT:    vsrl.vi v9, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v9
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctpop.v2i64(<2 x i64> %va, <2 x i1> %m, i32 %evl)
   ret <2 x i64> %v
@@ -707,36 +707,36 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v2i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    vand.vv v9, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a0
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
@@ -745,34 +745,34 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v2i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a2
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -783,74 +783,74 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vsub.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v10, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
@@ -859,36 +859,36 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v4i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    vand.vv v10, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
@@ -897,34 +897,34 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v4i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -935,74 +935,74 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v12
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vsub.vv v8, v8, v12
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v12, v8
+; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v12
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64> %va, <8 x i1> %m, i32 %evl)
   ret <8 x i64> %v
@@ -1011,36 +1011,36 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v8i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    vand.vv v12, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
@@ -1049,34 +1049,34 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v8i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a2
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1087,74 +1087,74 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v15i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
@@ -1163,36 +1163,36 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v15i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
@@ -1201,34 +1201,34 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v15i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1239,74 +1239,74 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v16i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
@@ -1315,36 +1315,36 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v16i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
@@ -1353,34 +1353,34 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v16i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1393,132 +1393,82 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 2
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB34_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:  .LBB34_2:
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v24, v0
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 1
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v24, v16, v0
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vmul.vv v8, v8, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1527,62 +1477,48 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_ctpop_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v7, v0, 2
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB34_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB34_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    addi a5, a0, -16
-; RV64-NEXT:    sltu a0, a0, a5
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a5
-; RV64-NEXT:    li a5, 56
-; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a2, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vmul.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vsrl.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 56
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vsub.vv v8, v8, v24
+; RV64-NEXT:    vsrl.vi v24, v16, 1
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vsub.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vand.vx v24, v16, a1
+; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v16, v16, a3
+; RV64-NEXT:    vsrl.vx v8, v8, a4
+; RV64-NEXT:    vsrl.vx v16, v16, a4
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
@@ -1591,138 +1527,134 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB35_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:  .LBB35_2:
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v16, 1
 ; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v16, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsrl.vi v24, v16, 4
 ; RV32-NEXT:    vadd.vv v16, v16, v24
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v16, v16, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a2
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v16, a2
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vsrl.vx v16, v16, a0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v32i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB35_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB35_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addi a2, a2, 1365
-; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a4, a4, -241
-; RV64-NEXT:    addi a5, a5, 257
-; RV64-NEXT:    slli a6, a2, 32
-; RV64-NEXT:    add a2, a2, a6
-; RV64-NEXT:    slli a6, a3, 32
-; RV64-NEXT:    add a3, a3, a6
-; RV64-NEXT:    slli a6, a4, 32
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 56
+; RV64-NEXT:    vand.vx v24, v24, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vand.vx v24, v24, a0
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vand.vx v24, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vand.vx v24, v16, a1
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v16, v16, a3
+; RV64-NEXT:    vsrl.vx v8, v8, a4
+; RV64-NEXT:    vsrl.vx v16, v16, a4
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 098384d200045..0d54224154494 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -7,22 +7,19 @@
 define <2 x i8> @vp_cttz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
   %v = call <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8> %va, i1 false, <2 x i1> %m, i32 %evl)
   ret <2 x i8> %v
@@ -31,22 +28,19 @@ define <2 x i8> @vp_cttz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i8> @vp_cttz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
   %v = call <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8> %va, i1 false, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i8> %v
@@ -55,22 +49,19 @@ define <2 x i8> @vp_cttz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 define <4 x i8> @vp_cttz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
   %v = call <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8> %va, i1 false, <4 x i1> %m, i32 %evl)
   ret <4 x i8> %v
@@ -79,22 +70,19 @@ define <4 x i8> @vp_cttz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i8> @vp_cttz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
   %v = call <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8> %va, i1 false, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i8> %v
@@ -103,22 +91,19 @@ define <4 x i8> @vp_cttz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 define <8 x i8> @vp_cttz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v12
+; CHECK-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
   %v = call <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8> %va, i1 false, <8 x i1> %m, i32 %evl)
   ret <8 x i8> %v
@@ -127,22 +112,19 @@ define <8 x i8> @vp_cttz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i8> @vp_cttz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v12
+; CHECK-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
   %v = call <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8> %va, i1 false, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i8> %v
@@ -151,22 +133,19 @@ define <8 x i8> @vp_cttz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 define <16 x i8> @vp_cttz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-NEXT:    vnsrl.wi v10, v12, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v10, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
   %v = call <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i8> %v
@@ -175,22 +154,19 @@ define <16 x i8> @vp_cttz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 define <16 x i8> @vp_cttz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-NEXT:    vnsrl.wi v10, v12, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v9, v10, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    vmerge.vim v8, v8, 8, v0
 ; CHECK-NEXT:    ret
   %v = call <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8> %va, i1 false, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i8> %v
@@ -199,29 +175,16 @@ define <16 x i8> @vp_cttz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 define <2 x i16> @vp_cttz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> %va, i1 false, <2 x i1> %m, i32 %evl)
   ret <2 x i16> %v
@@ -230,29 +193,16 @@ define <2 x i16> @vp_cttz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i16> @vp_cttz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> %va, i1 false, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i16> %v
@@ -261,29 +211,16 @@ define <2 x i16> @vp_cttz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 define <4 x i16> @vp_cttz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> %va, i1 false, <4 x i1> %m, i32 %evl)
   ret <4 x i16> %v
@@ -292,29 +229,16 @@ define <4 x i16> @vp_cttz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i16> @vp_cttz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> %va, i1 false, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i16> %v
@@ -323,29 +247,16 @@ define <4 x i16> @vp_cttz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 define <8 x i16> @vp_cttz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> %va, i1 false, <8 x i1> %m, i32 %evl)
   ret <8 x i16> %v
@@ -354,29 +265,16 @@ define <8 x i16> @vp_cttz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i16> @vp_cttz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> %va, i1 false, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i16> %v
@@ -385,29 +283,16 @@ define <8 x i16> @vp_cttz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 define <16 x i16> @vp_cttz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v12, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i16> %v
@@ -416,29 +301,16 @@ define <16 x i16> @vp_cttz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl
 define <16 x i16> @vp_cttz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsub.vx v8, v12, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> %va, i1 false, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i16> %v
@@ -447,30 +319,17 @@ define <16 x i16> @vp_cttz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 define <2 x i32> @vp_cttz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-NEXT:    vnsrl.wx v9, v10, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32> %va, i1 false, <2 x i1> %m, i32 %evl)
   ret <2 x i32> %v
@@ -479,30 +338,17 @@ define <2 x i32> @vp_cttz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i32> @vp_cttz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-NEXT:    vnsrl.wx v9, v10, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32> %va, i1 false, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i32> %v
@@ -511,30 +357,17 @@ define <2 x i32> @vp_cttz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-NEXT:    vnsrl.wx v9, v10, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %va, i1 false, <4 x i1> %m, i32 %evl)
   ret <4 x i32> %v
@@ -543,30 +376,17 @@ define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i32> @vp_cttz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-NEXT:    vnsrl.wx v9, v10, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %va, i1 false, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i32> %v
@@ -575,30 +395,17 @@ define <4 x i32> @vp_cttz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 define <8 x i32> @vp_cttz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-NEXT:    vnsrl.wx v10, v12, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32> %va, i1 false, <8 x i1> %m, i32 %evl)
   ret <8 x i32> %v
@@ -607,30 +414,17 @@ define <8 x i32> @vp_cttz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i32> @vp_cttz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-NEXT:    vnsrl.wx v10, v12, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32> %va, i1 false, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i32> %v
@@ -639,30 +433,17 @@ define <8 x i32> @vp_cttz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 define <16 x i32> @vp_cttz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v12, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-NEXT:    vnsrl.wx v12, v16, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v12, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i32> %v
@@ -671,1240 +452,286 @@ define <16 x i32> @vp_cttz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl
 define <16 x i32> @vp_cttz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_v16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v12, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-NEXT:    vnsrl.wx v12, v16, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v12, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    ret
   %v = call <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32> %va, i1 false, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i32> %v
 }
 
 define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vi v9, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vadd.vi v9, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v9, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfcvt.f.xu.v v9, v9
+; CHECK-NEXT:    vsrl.vx v9, v9, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl)
   ret <2 x i64> %v
 }
 
 define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v2i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vi v9, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    vand.vv v9, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vmul.vv v8, v8, v9
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v2i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vadd.vi v9, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a1
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v9, v8, v9
+; CHECK-NEXT:    vfcvt.f.xu.v v9, v9
+; CHECK-NEXT:    vsrl.vx v9, v9, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v9, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 false, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i64> %v
 }
 
 define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vi v10, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vadd.vi v10, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vfcvt.f.xu.v v10, v10
+; CHECK-NEXT:    vsrl.vx v10, v10, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
 }
 
 define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v4i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vi v10, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v4i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vadd.vi v10, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a1
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vfcvt.f.xu.v v10, v10
+; CHECK-NEXT:    vsrl.vx v10, v10, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v10, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 false, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
 }
 
 define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vi v12, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v12, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vadd.vi v12, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v12, v8, v12
+; CHECK-NEXT:    vfcvt.f.xu.v v12, v12
+; CHECK-NEXT:    vsrl.vx v12, v12, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v12, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl)
   ret <8 x i64> %v
 }
 
 define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v8i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vi v12, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v12, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v12
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v8i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vadd.vi v12, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a1
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v12, v8, v12
+; CHECK-NEXT:    vfcvt.f.xu.v v12, v12
+; CHECK-NEXT:    vsrl.vx v12, v12, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v12, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 false, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i64> %v
 }
 
 define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v15i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v15i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v15i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v16, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
 }
 
 define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v15i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v15i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v15i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v16, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> splat (i1 true), i32 %evl)
   ret <15 x i64> %v
 }
 
 define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v16i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v16i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v16, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
 }
 
 define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v16i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v16i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v16, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsub.vx v8, v16, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 false, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i64> %v
 }
 
 define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v32i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 2
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB34_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:  .LBB34_2:
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v8, -1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v32i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v7, v0, 2
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB34_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB34_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v8, -1, v0.t
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a5, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a6, a1, a3
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a3, a2, a3
-; RV64-NEXT:    slli a1, a5, 32
-; RV64-NEXT:    add a1, a5, a1
-; RV64-NEXT:    slli a2, a4, 32
-; RV64-NEXT:    add a2, a4, a2
-; RV64-NEXT:    addi a4, a0, -16
-; RV64-NEXT:    sltu a0, a0, a4
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a4, a0, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v16, -1, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vmul.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v32i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v24, v8, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v24
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
+; CHECK-NEXT:    vand.vv v24, v16, v24
+; CHECK-NEXT:    vmseq.vi v7, v16, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1023
+; CHECK-NEXT:    li a3, 64
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v24
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vsub.vx v8, v8, a2
+; CHECK-NEXT:    vsub.vx v16, v16, a2
+; CHECK-NEXT:    vmerge.vxm v8, v8, a3, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vxm v16, v16, a3, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 false, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
 }
 
 define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_v32i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB35_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:  .LBB35_2:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v8, -1
-; RV32-NEXT:    vnot.v v0, v8
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v0, v24
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vsub.vv v24, v24, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v0, v16, -1
-; RV32-NEXT:    vnot.v v16, v16
-; RV32-NEXT:    vand.vv v0, v16, v0
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v16, v24, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v24, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v24
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a2
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v16, a2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_v32i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB35_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB35_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addi a2, a2, 1365
-; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a4, a4, -241
-; RV64-NEXT:    addi a5, a5, 257
-; RV64-NEXT:    slli a6, a2, 32
-; RV64-NEXT:    add a2, a2, a6
-; RV64-NEXT:    slli a6, a3, 32
-; RV64-NEXT:    add a3, a3, a6
-; RV64-NEXT:    slli a6, a4, 32
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v16, -1
-; RV64-NEXT:    vnot.v v16, v16
-; RV64-NEXT:    vand.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a3
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_v32i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v24, v8, 0
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v24
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
+; CHECK-NEXT:    vand.vv v24, v16, v24
+; CHECK-NEXT:    vmseq.vi v7, v16, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1023
+; CHECK-NEXT:    li a3, 64
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v24
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vsub.vx v8, v8, a2
+; CHECK-NEXT:    vsub.vx v16, v16, a2
+; CHECK-NEXT:    vmerge.vxm v8, v8, a3, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vxm v16, v16, a3, v0
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
 }
@@ -1912,22 +739,17 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 define <2 x i8> @vp_cttz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8> %va, i1 true, <2 x i1> %m, i32 %evl)
   ret <2 x i8> %v
@@ -1936,22 +758,17 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext
 define <2 x i8> @vp_cttz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v2i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8> %va, i1 true, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i8> %v
@@ -1960,22 +777,17 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl
 define <4 x i8> @vp_cttz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8> %va, i1 true, <4 x i1> %m, i32 %evl)
   ret <4 x i8> %v
@@ -1984,22 +796,17 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext
 define <4 x i8> @vp_cttz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v4i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8> %va, i1 true, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i8> %v
@@ -2008,22 +815,17 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl
 define <8 x i8> @vp_cttz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8> %va, i1 true, <8 x i1> %m, i32 %evl)
   ret <8 x i8> %v
@@ -2032,22 +834,17 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext
 define <8 x i8> @vp_cttz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v8i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8> %va, i1 true, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i8> %v
@@ -2056,22 +853,17 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl
 define <16 x i8> @vp_cttz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i8> %v
@@ -2080,22 +872,17 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero
 define <16 x i8> @vp_cttz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v16i8_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8> %va, i1 true, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i8> %v
@@ -2104,29 +891,13 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %
 define <2 x i16> @vp_cttz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> %va, i1 true, <2 x i1> %m, i32 %evl)
   ret <2 x i16> %v
@@ -2135,29 +906,13 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe
 define <2 x i16> @vp_cttz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v2i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> %va, i1 true, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i16> %v
@@ -2166,29 +921,13 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %
 define <4 x i16> @vp_cttz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> %va, i1 true, <4 x i1> %m, i32 %evl)
   ret <4 x i16> %v
@@ -2197,29 +936,13 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe
 define <4 x i16> @vp_cttz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v4i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> %va, i1 true, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i16> %v
@@ -2228,29 +951,13 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %
 define <8 x i16> @vp_cttz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> %va, i1 true, <8 x i1> %m, i32 %evl)
   ret <8 x i16> %v
@@ -2259,29 +966,13 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe
 define <8 x i16> @vp_cttz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v8i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wi v10, v8, 23
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> %va, i1 true, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i16> %v
@@ -2290,29 +981,13 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %
 define <16 x i16> @vp_cttz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i16> %v
@@ -2321,29 +996,13 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z
 define <16 x i16> @vp_cttz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v16i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wi v12, v8, 23
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> %va, i1 true, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i16> %v
@@ -2352,30 +1011,14 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex
 define <2 x i32> @vp_cttz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wx v8, v9, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32> %va, i1 true, <2 x i1> %m, i32 %evl)
   ret <2 x i32> %v
@@ -2384,30 +1027,14 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe
 define <2 x i32> @vp_cttz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v2i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT:    vnsrl.wx v8, v9, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %v = call <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32> %va, i1 true, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i32> %v
@@ -2416,30 +1043,14 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %
 define <4 x i32> @vp_cttz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wx v10, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %va, i1 true, <4 x i1> %m, i32 %evl)
   ret <4 x i32> %v
@@ -2448,30 +1059,14 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe
 define <4 x i32> @vp_cttz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v4i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v9, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v10, v8, v9
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT:    vnsrl.wx v10, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %v = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %va, i1 true, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i32> %v
@@ -2480,30 +1075,14 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %
 define <8 x i32> @vp_cttz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wx v12, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32> %va, i1 true, <8 x i1> %m, i32 %evl)
   ret <8 x i32> %v
@@ -2512,30 +1091,14 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe
 define <8 x i32> @vp_cttz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v8i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v12, v8, v10
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT:    vnsrl.wx v12, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %v = call <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32> %va, i1 true, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i32> %v
@@ -2544,30 +1107,14 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %
 define <16 x i32> @vp_cttz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v16i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v16, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wx v16, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i32> %v
@@ -2576,1240 +1123,244 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z
 define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_v16i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v8, -1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v12, v8
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vand.vv v16, v8, v12
+; CHECK-NEXT:    vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT:    vnsrl.wx v16, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32> %va, i1 true, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i32> %v
 }
 
 define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vi v9, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vadd.vi v9, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v9, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 true, <2 x i1> %m, i32 %evl)
   ret <2 x i64> %v
 }
 
 define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vi v9, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    vand.vv v9, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vadd.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vmul.vv v8, v8, v9
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vadd.vi v9, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a1
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 true, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i64> %v
 }
 
 define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vi v10, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vadd.vi v10, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 true, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
 }
 
 define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vi v10, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vadd.vi v10, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a1
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 true, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
 }
 
 define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vi v12, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v12, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vadd.vi v12, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v12
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 true, <8 x i1> %m, i32 %evl)
   ret <8 x i64> %v
 }
 
 define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vi v12, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v12, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v12
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vadd.vi v12, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a1
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v12
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 true, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i64> %v
 }
 
 define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v15i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v15i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v15i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
 }
 
 define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 true, <15 x i1> splat (i1 true), i32 %evl)
   ret <15 x i64> %v
 }
 
 define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v16i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v16i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
 }
 
 define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vand.vv v24, v16, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a3, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    slli a5, a2, 32
-; RV64-NEXT:    add a2, a2, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v16, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a1
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a1
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 true, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i64> %v
 }
 
 define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v32i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 2
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB70_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:  .LBB70_2:
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v8, -1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v32i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v7, v0, 2
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB70_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB70_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v8, -1, v0.t
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addi a1, a1, 1365
-; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a5, a3, -241
-; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a6, a1, a3
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a3, a2, a3
-; RV64-NEXT:    slli a1, a5, 32
-; RV64-NEXT:    add a1, a5, a1
-; RV64-NEXT:    slli a2, a4, 32
-; RV64-NEXT:    add a2, a4, a2
-; RV64-NEXT:    addi a4, a0, -16
-; RV64-NEXT:    sltu a0, a0, a4
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a4, a0, a4
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v16, -1, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
-; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
-; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vmul.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v32i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v24, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1023
+; CHECK-NEXT:    vand.vv v8, v8, v24
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
+; CHECK-NEXT:    vand.vv v16, v16, v24
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vsub.vx v8, v8, a2
+; CHECK-NEXT:    vsub.vx v16, v16, a2
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 true, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
 }
 
 define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB71_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:  .LBB71_2:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v8, -1
-; RV32-NEXT:    vnot.v v0, v8
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v0, v24
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vsub.vv v24, v24, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v0, v16, -1
-; RV32-NEXT:    vnot.v v16, v16
-; RV32-NEXT:    vand.vv v0, v16, v0
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v16, v24, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v24, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v24
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a2
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v16, a2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB71_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:  .LBB71_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v8, -1
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addi a2, a2, 1365
-; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a4, a4, -241
-; RV64-NEXT:    addi a5, a5, 257
-; RV64-NEXT:    slli a6, a2, 32
-; RV64-NEXT:    add a2, a2, a6
-; RV64-NEXT:    slli a6, a3, 32
-; RV64-NEXT:    add a3, a3, a6
-; RV64-NEXT:    slli a6, a4, 32
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vi v24, v16, -1
-; RV64-NEXT:    vnot.v v16, v16
-; RV64-NEXT:    vand.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a3
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
-; RV64-NEXT:    ret
+; CHECK-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vrsub.vi v24, v8, 0
+; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    li a2, 1023
+; CHECK-NEXT:    vand.vv v8, v8, v24
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
+; CHECK-NEXT:    vand.vv v16, v16, v24
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-NEXT:    vsub.vx v8, v8, a2
+; CHECK-NEXT:    vsub.vx v16, v16, a2
+; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}



More information about the llvm-commits mailing list