[llvm] [RISCV] Construct constants via instructions if materialization is costly (PR #87036)

Wang Pengcheng via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 28 21:35:55 PDT 2024


https://github.com/wangpc-pp created https://github.com/llvm/llvm-project/pull/87036

For RISC-V, it is costly to materialize the constants used when lowering
`ISD::CTPOP`/`ISD::VP_CTPOP`.

We can query the materialization cost via `RISCVMatInt::getIntMatCost`,
and if the cost is larger than 2, we construct the constant from an
already-materialized mask with two instructions (a shift plus an xor/and)
instead.

This fixes #86207.
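
For reference, a minimal standalone sketch (plain C++, not part of the
patch; popcount64 is only an illustrative helper) of the identities the
lowering relies on: only the 0x0F0F... splat is materialized directly
(on RV64 via the usual lui/addiw/slli/add sequence), and each of the
other popcount masks is then one shift plus one xor/and away, which is
what the `getIntMatCost(...) > 2` checks in the patch select.

#include <cassert>
#include <cstdint>

// Classic parallel popcount, but only the 0x0F0F... splat is spelled out;
// the remaining masks are derived from it, mirroring the identities in
// lowerScalarCTPOP/lowerVectorCTPOP below.
static unsigned popcount64(uint64_t V) {
  const uint64_t M0F = 0x0F0F0F0F0F0F0F0FULL; // materialized directly
  const uint64_t M33 = M0F ^ (M0F << 2);      // 0x3333333333333333
  const uint64_t M55 = M33 ^ (M33 << 1);      // 0x5555555555555555
  const uint64_t M01 = M0F & (M0F >> 3);      // 0x0101010101010101
  V = V - ((V >> 1) & M55);                   // 2-bit partial sums
  V = (V & M33) + ((V >> 2) & M33);           // 4-bit partial sums
  V = (V + (V >> 4)) & M0F;                   // 8-bit partial sums
  return unsigned((V * M01) >> 56);           // add all bytes into the top byte
}

int main() {
  assert(popcount64(0x0000000000000000ULL) == 0);
  assert(popcount64(0x8000000000000001ULL) == 2);
  assert(popcount64(0xFFFFFFFFFFFFFFFFULL) == 64);
  return 0;
}

This way the long 64-bit materialization sequence is only paid once (for
0x0F0F...), as the updated checks in ctlz-cttz-ctpop.ll show.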


From 26d1fcc7a3eb56071a0824ec17ef52b7fd88bc34 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Fri, 29 Mar 2024 12:35:35 +0800
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.6-beta.1
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  201 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |    3 +
 llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll    | 1608 ++++--
 .../CodeGen/RISCV/ctz_zero_return_test.ll     |  107 +-
 llvm/test/CodeGen/RISCV/pr56457.ll            |   24 +-
 .../RISCV/rv64-legal-i32/rv64xtheadbb.ll      |   57 +-
 .../CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll   |   95 +-
 llvm/test/CodeGen/RISCV/rv64xtheadbb.ll       |  187 +-
 llvm/test/CodeGen/RISCV/rv64zbb.ll            |  383 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll    |  664 ++-
 llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll   |  372 +-
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       | 1946 ++++---
 llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll    |  836 ++-
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        | 2134 ++++----
 .../RISCV/rvv/fixed-vectors-ctlz-vp.ll        | 4386 +++++++++-------
 .../CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll   |  280 +-
 .../RISCV/rvv/fixed-vectors-ctpop-vp.ll       | 2059 +++++---
 .../CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll  |  164 +-
 .../RISCV/rvv/fixed-vectors-cttz-vp.ll        | 4566 ++++++++++-------
 .../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll   |  392 +-
 llvm/test/CodeGen/RISCV/sextw-removal.ll      |   58 +-
 21 files changed, 11809 insertions(+), 8713 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e6814c5f71a09b..031030990d4405 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -391,7 +391,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
     }
   } else if (!Subtarget.hasVendorXCVbitmanip()) {
-    setOperationAction({ISD::CTTZ, ISD::CTPOP}, XLenVT, Expand);
+    setOperationAction(ISD::CTTZ, XLenVT, Expand);
+    setOperationAction(ISD::CTPOP, XLenVT,
+                       Subtarget.is64Bit() ? Custom : Expand);
     if (RV64LegalI32 && Subtarget.is64Bit())
       setOperationAction({ISD::CTTZ, ISD::CTPOP}, MVT::i32, Expand);
   }
@@ -901,11 +903,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                            VT, Custom);
       } else {
         setOperationAction({ISD::BITREVERSE, ISD::VP_BITREVERSE}, VT, Expand);
-        setOperationAction({ISD::CTLZ, ISD::CTTZ, ISD::CTPOP}, VT, Expand);
+        setOperationAction({ISD::CTLZ, ISD::CTTZ}, VT, Expand);
         setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
-                            ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
+                            ISD::VP_CTTZ_ZERO_UNDEF},
                            VT, Expand);
 
+        setOperationAction({ISD::CTPOP, ISD::VP_CTPOP}, VT, Custom);
+
         // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
         // range of f32.
         EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
@@ -1238,6 +1242,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                               ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTPOP},
                              VT, Custom);
         } else {
+          setOperationAction({ISD::CTPOP, ISD::VP_CTPOP}, VT, Custom);
           // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
           // range of f32.
           EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
@@ -6746,8 +6751,18 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::UDIV:
   case ISD::UREM:
   case ISD::BSWAP:
-  case ISD::CTPOP:
     return lowerToScalableOp(Op, DAG);
+  case ISD::CTPOP: {
+    if (Op.getValueType().isScalarInteger())
+      return lowerScalarCTPOP(Op, DAG);
+    if (Subtarget.hasStdExtZvbb())
+      return lowerToScalableOp(Op, DAG);
+    return lowerVectorCTPOP(Op, DAG);
+  }
+  case ISD::VP_CTPOP:
+    if (Subtarget.hasStdExtZvbb())
+      return lowerVPOp(Op, DAG);
+    return lowerVectorCTPOP(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
@@ -6972,8 +6987,6 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     if (Subtarget.hasStdExtZvbb())
       return lowerVPOp(Op, DAG);
     return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
-  case ISD::VP_CTPOP:
-    return lowerVPOp(Op, DAG);
   case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
     return lowerVPStridedLoad(Op, DAG);
   case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
@@ -10755,6 +10768,182 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const {
   return Max;
 }
 
+SDValue RISCVTargetLowering::lowerScalarCTPOP(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  MVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()).getSimpleVT();
+  unsigned Len = VT.getScalarSizeInBits();
+  assert(VT.isInteger() && "lowerScalarCTPOP not implemented for this type.");
+
+  SDValue V = Op.getOperand(0);
+
+  // This is the same algorithm as TargetLowering::expandCTPOP from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  // 0x0F0F0F0F...
+  const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+  SDValue Mask0F = DAG.getConstant(Constant0F, DL, VT, false, true);
+  // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+  const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
+  SDValue Mask33 =
+      RISCVMatInt::getIntMatCost(Constant33, VT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(ISD::XOR, DL, VT, Mask0F,
+                        DAG.getNode(ISD::SHL, DL, VT, Mask0F,
+                                    DAG.getShiftAmountConstant(2, VT, DL)))
+          : DAG.getConstant(Constant33, DL, VT);
+  // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+  const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+  SDValue Mask55 =
+      RISCVMatInt::getIntMatCost(Constant55, VT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(ISD::XOR, DL, VT, Mask33,
+                        DAG.getNode(ISD::SHL, DL, VT, Mask33,
+                                    DAG.getShiftAmountConstant(1, VT, DL)))
+          : DAG.getConstant(Constant55, DL, VT);
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  V = DAG.getNode(ISD::SUB, DL, VT, V,
+                  DAG.getNode(ISD::AND, DL, VT,
+                              DAG.getNode(ISD::SRL, DL, VT, V,
+                                          DAG.getConstant(1, DL, ShVT)),
+                              Mask55));
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  V = DAG.getNode(ISD::ADD, DL, VT, DAG.getNode(ISD::AND, DL, VT, V, Mask33),
+                  DAG.getNode(ISD::AND, DL, VT,
+                              DAG.getNode(ISD::SRL, DL, VT, V,
+                                          DAG.getConstant(2, DL, ShVT)),
+                              Mask33));
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  V = DAG.getNode(ISD::AND, DL, VT,
+                  DAG.getNode(ISD::ADD, DL, VT, V,
+                              DAG.getNode(ISD::SRL, DL, VT, V,
+                                          DAG.getConstant(4, DL, ShVT))),
+                  Mask0F);
+
+  // v = (v * 0x01010101...) >> (Len - 8)
+  // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+  const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
+  SDValue Mask01 =
+      RISCVMatInt::getIntMatCost(Constant01, VT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(ISD::AND, DL, VT, Mask0F,
+                        DAG.getNode(ISD::SRL, DL, VT, Mask0F,
+                                    DAG.getShiftAmountConstant(3, VT, DL)))
+          : DAG.getConstant(Constant01, DL, VT);
+  return DAG.getNode(ISD::SRL, DL, VT, DAG.getNode(ISD::MUL, DL, VT, V, Mask01),
+                     DAG.getConstant(Len - 8, DL, ShVT));
+}
+
+SDValue RISCVTargetLowering::lowerVectorCTPOP(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  unsigned Len = VT.getScalarSizeInBits();
+  assert(VT.isInteger() && "lowerVectorCTPOP not implemented for this type.");
+
+  SDValue V = Op.getOperand(0);
+  MVT ContainerVT = VT;
+  if (VT.isFixedLengthVector()) {
+    ContainerVT = getContainerForFixedLengthVector(VT);
+    V = convertToScalableVector(ContainerVT, V, DAG, Subtarget);
+  }
+
+  SDValue Mask, VL;
+  if (Op->getOpcode() == ISD::VP_CTPOP) {
+    Mask = Op->getOperand(1);
+    if (VT.isFixedLengthVector())
+      Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG,
+                                     Subtarget);
+    VL = Op->getOperand(2);
+  } else
+    std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+  // This is the same algorithm as TargetLowering::expandVPCTPOP from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+
+  // 0x0F0F0F0F...
+  const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+  SDValue Mask0F = DAG.getConstant(Constant0F, DL, ContainerVT);
+  // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+  const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
+  SDValue Mask33 =
+      RISCVMatInt::getIntMatCost(Constant33, ContainerVT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(RISCVISD::XOR_VL, DL, ContainerVT, Mask0F,
+                        DAG.getNode(RISCVISD::SHL_VL, DL, ContainerVT, Mask0F,
+                                    DAG.getConstant(2, DL, ContainerVT),
+                                    DAG.getUNDEF(ContainerVT), Mask, VL),
+                        DAG.getUNDEF(ContainerVT), Mask, VL)
+          : DAG.getConstant(Constant33, DL, ContainerVT);
+  // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+  const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+  SDValue Mask55 =
+      RISCVMatInt::getIntMatCost(Constant55, ContainerVT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(RISCVISD::XOR_VL, DL, ContainerVT, Mask33,
+                        DAG.getNode(RISCVISD::SHL_VL, DL, ContainerVT, Mask33,
+                                    DAG.getConstant(1, DL, ContainerVT),
+                                    DAG.getUNDEF(ContainerVT), Mask, VL),
+                        DAG.getUNDEF(ContainerVT), Mask, VL)
+          : DAG.getConstant(Constant55, DL, ContainerVT);
+
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5;
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  Tmp1 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT,
+                     DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+                                 DAG.getConstant(1, DL, ContainerVT),
+                                 DAG.getUNDEF(ContainerVT), Mask, VL),
+                     Mask55, DAG.getUNDEF(ContainerVT), Mask, VL);
+  V = DAG.getNode(RISCVISD::SUB_VL, DL, ContainerVT, V, Tmp1,
+                  DAG.getUNDEF(ContainerVT), Mask, VL);
+
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  Tmp2 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, V, Mask33,
+                     DAG.getUNDEF(ContainerVT), Mask, VL);
+  Tmp3 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT,
+                     DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+                                 DAG.getConstant(2, DL, ContainerVT),
+                                 DAG.getUNDEF(ContainerVT), Mask, VL),
+                     Mask33, DAG.getUNDEF(ContainerVT), Mask, VL);
+  V = DAG.getNode(RISCVISD::ADD_VL, DL, ContainerVT, Tmp2, Tmp3,
+                  DAG.getUNDEF(ContainerVT), Mask, VL);
+
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  Tmp4 = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+                     DAG.getConstant(4, DL, ContainerVT),
+                     DAG.getUNDEF(ContainerVT), Mask, VL);
+  Tmp5 = DAG.getNode(RISCVISD::ADD_VL, DL, ContainerVT, V, Tmp4,
+                     DAG.getUNDEF(ContainerVT), Mask, VL);
+  V = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Tmp5, Mask0F,
+                  DAG.getUNDEF(ContainerVT), Mask, VL);
+
+  if (Len > 8) {
+    // v = (v * 0x01010101...) >> (Len - 8)
+    // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+    const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
+    SDValue Mask01 =
+        RISCVMatInt::getIntMatCost(
+            Constant01, ContainerVT.getScalarSizeInBits(), Subtarget) > 2
+            ? DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Mask0F,
+                          DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, Mask0F,
+                                      DAG.getConstant(3, DL, ContainerVT),
+                                      DAG.getUNDEF(ContainerVT), Mask, VL),
+                          DAG.getUNDEF(ContainerVT), Mask, VL)
+            : DAG.getConstant(Constant01, DL, ContainerVT);
+    V = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT,
+                    DAG.getNode(RISCVISD::MUL_VL, DL, ContainerVT, V, Mask01,
+                                DAG.getUNDEF(ContainerVT), Mask, VL),
+                    DAG.getConstant(Len - 8, DL, ContainerVT),
+                    DAG.getUNDEF(ContainerVT), Mask, VL);
+  }
+
+  if (VT.isFixedLengthVector())
+    V = convertFromScalableVector(VT, V, DAG, Subtarget);
+  return V;
+}
+
 SDValue RISCVTargetLowering::lowerFixedLengthVectorFCOPYSIGNToRVV(
     SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index c11b1464757c7f..cc8a18d9088106 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -959,6 +959,9 @@ class RISCVTargetLowering : public TargetLowering {
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue lowerScalarCTPOP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVectorCTPOP(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 455e6e54c9b396..1eaf91096336f3 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -53,28 +53,77 @@ define i8 @test_cttz_i8(i8 %a) nounwind {
 ; RV32_NOZBB-NEXT:    li a0, 8
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_cttz_i8:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    andi a1, a0, 255
-; RV64NOZBB-NEXT:    beqz a1, .LBB0_2
-; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
-; RV64NOZBB-NEXT:    addi a1, a0, -1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    andi a1, a1, 85
-; RV64NOZBB-NEXT:    subw a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 51
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    andi a0, a0, 51
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a0, a0, 15
-; RV64NOZBB-NEXT:    ret
-; RV64NOZBB-NEXT:  .LBB0_2:
-; RV64NOZBB-NEXT:    li a0, 8
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_cttz_i8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    beqz a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1: # %cond.false
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    andi a3, a0, 255
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    andi a0, a0, 85
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    and a0, a3, a2
+; RV64I-NEXT:    srli a3, a3, 2
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB0_2:
+; RV64I-NEXT:    li a0, 8
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_cttz_i8:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    andi a1, a0, 255
+; RV64M-NEXT:    beqz a1, .LBB0_2
+; RV64M-NEXT:  # %bb.1: # %cond.false
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    addi a3, a0, -1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    andi a3, a0, 255
+; RV64M-NEXT:    srli a0, a0, 1
+; RV64M-NEXT:    andi a0, a0, 85
+; RV64M-NEXT:    sub a3, a3, a0
+; RV64M-NEXT:    and a0, a3, a2
+; RV64M-NEXT:    srli a3, a3, 2
+; RV64M-NEXT:    and a2, a3, a2
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
+; RV64M-NEXT:  .LBB0_2:
+; RV64M-NEXT:    li a0, 8
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_cttz_i8:
 ; RV32ZBB:       # %bb.0:
@@ -154,35 +203,83 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    li a0, 16
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_cttz_i16:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    beqz a1, .LBB1_2
-; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
-; RV64NOZBB-NEXT:    addi a1, a0, -1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
-; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 15
-; RV64NOZBB-NEXT:    slli a0, a0, 52
-; RV64NOZBB-NEXT:    srli a0, a0, 60
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    ret
-; RV64NOZBB-NEXT:  .LBB1_2:
-; RV64NOZBB-NEXT:    li a0, 16
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_cttz_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    beqz a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1: # %cond.false
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 5
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli a0, a0, 48
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB1_2:
+; RV64I-NEXT:    li a0, 16
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_cttz_i16:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    beqz a1, .LBB1_2
+; RV64M-NEXT:  # %bb.1: # %cond.false
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    addi a3, a0, -1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    srli a3, a0, 1
+; RV64M-NEXT:    lui a4, 5
+; RV64M-NEXT:    addiw a4, a4, 1365
+; RV64M-NEXT:    and a3, a3, a4
+; RV64M-NEXT:    slli a0, a0, 48
+; RV64M-NEXT:    srli a0, a0, 48
+; RV64M-NEXT:    sub a0, a0, a3
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
+; RV64M-NEXT:  .LBB1_2:
+; RV64M-NEXT:    li a0, 16
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_cttz_i16:
 ; RV32ZBB:       # %bb.0:
@@ -422,16 +519,33 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, %hi(.LCPI3_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srli a0, a0, 58
-; RV64I-NEXT:    lui a1, %hi(.LCPI3_1)
-; RV64I-NEXT:    addi a1, a1, %lo(.LCPI3_1)
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -475,16 +589,33 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    beqz a0, .LBB3_2
 ; RV64M-NEXT:  # %bb.1: # %cond.false
-; RV64M-NEXT:    lui a1, %hi(.LCPI3_0)
-; RV64M-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
-; RV64M-NEXT:    neg a2, a0
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    addi a3, a0, -1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    srli a3, a0, 1
+; RV64M-NEXT:    lui a4, 349525
+; RV64M-NEXT:    addiw a4, a4, 1365
+; RV64M-NEXT:    slli a5, a4, 32
+; RV64M-NEXT:    add a4, a4, a5
+; RV64M-NEXT:    and a3, a3, a4
+; RV64M-NEXT:    sub a0, a0, a3
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
 ; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    srli a0, a0, 58
-; RV64M-NEXT:    lui a1, %hi(.LCPI3_1)
-; RV64M-NEXT:    addi a1, a1, %lo(.LCPI3_1)
-; RV64M-NEXT:    add a0, a1, a0
-; RV64M-NEXT:    lbu a0, 0(a0)
+; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
 ; RV64M-NEXT:  .LBB3_2:
 ; RV64M-NEXT:    li a0, 64
@@ -565,22 +696,65 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
 ; RV32_NOZBB-NEXT:    andi a0, a0, 15
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_cttz_i8_zero_undef:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    addi a1, a0, -1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    andi a1, a1, 85
-; RV64NOZBB-NEXT:    subw a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 51
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    andi a0, a0, 51
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a0, a0, 15
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_cttz_i8_zero_undef:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    andi a3, a0, 255
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    andi a0, a0, 85
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    and a0, a3, a2
+; RV64I-NEXT:    srli a3, a3, 2
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_cttz_i8_zero_undef:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    addi a3, a0, -1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    andi a3, a0, 255
+; RV64M-NEXT:    srli a0, a0, 1
+; RV64M-NEXT:    andi a0, a0, 85
+; RV64M-NEXT:    sub a3, a3, a0
+; RV64M-NEXT:    and a0, a3, a2
+; RV64M-NEXT:    srli a3, a3, 2
+; RV64M-NEXT:    and a2, a3, a2
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_cttz_i8_zero_undef:
 ; RV32ZBB:       # %bb.0:
@@ -640,29 +814,71 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_cttz_i16_zero_undef:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    addi a1, a0, -1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
-; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 15
-; RV64NOZBB-NEXT:    slli a0, a0, 52
-; RV64NOZBB-NEXT:    srli a0, a0, 60
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_cttz_i16_zero_undef:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 5
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli a0, a0, 48
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_cttz_i16_zero_undef:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    addi a3, a0, -1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    srli a3, a0, 1
+; RV64M-NEXT:    lui a4, 5
+; RV64M-NEXT:    addiw a4, a4, 1365
+; RV64M-NEXT:    and a3, a3, a4
+; RV64M-NEXT:    slli a0, a0, 48
+; RV64M-NEXT:    srli a0, a0, 48
+; RV64M-NEXT:    sub a0, a0, a3
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_cttz_i16_zero_undef:
 ; RV32ZBB:       # %bb.0:
@@ -846,16 +1062,33 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, %hi(.LCPI7_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI7_0)(a1)
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srli a0, a0, 58
-; RV64I-NEXT:    lui a1, %hi(.LCPI7_1)
-; RV64I-NEXT:    addi a1, a1, %lo(.LCPI7_1)
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -889,16 +1122,33 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
 ;
 ; RV64M-LABEL: test_cttz_i64_zero_undef:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    lui a1, %hi(.LCPI7_0)
-; RV64M-NEXT:    ld a1, %lo(.LCPI7_0)(a1)
-; RV64M-NEXT:    neg a2, a0
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    addi a3, a0, -1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    srli a3, a0, 1
+; RV64M-NEXT:    lui a4, 349525
+; RV64M-NEXT:    addiw a4, a4, 1365
+; RV64M-NEXT:    slli a5, a4, 32
+; RV64M-NEXT:    add a4, a4, a5
+; RV64M-NEXT:    and a3, a3, a4
+; RV64M-NEXT:    sub a0, a0, a3
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
 ; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    srli a0, a0, 58
-; RV64M-NEXT:    lui a1, %hi(.LCPI7_1)
-; RV64M-NEXT:    addi a1, a1, %lo(.LCPI7_1)
-; RV64M-NEXT:    add a0, a1, a0
-; RV64M-NEXT:    lbu a0, 0(a0)
+; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_cttz_i64_zero_undef:
@@ -984,35 +1234,91 @@ define i8 @test_ctlz_i8(i8 %a) nounwind {
 ; RV32_NOZBB-NEXT:    li a0, 8
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_ctlz_i8:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    andi a1, a0, 255
-; RV64NOZBB-NEXT:    beqz a1, .LBB8_2
-; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
-; RV64NOZBB-NEXT:    slli a1, a0, 56
-; RV64NOZBB-NEXT:    srli a1, a1, 57
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 56
-; RV64NOZBB-NEXT:    srli a1, a1, 58
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 56
-; RV64NOZBB-NEXT:    srli a1, a1, 60
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    andi a1, a1, 85
-; RV64NOZBB-NEXT:    subw a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 51
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    andi a0, a0, 51
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a0, a0, 15
-; RV64NOZBB-NEXT:    ret
-; RV64NOZBB-NEXT:  .LBB8_2:
-; RV64NOZBB-NEXT:    li a0, 8
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_ctlz_i8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    beqz a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1: # %cond.false
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srli a1, a1, 57
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srli a1, a1, 58
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srli a1, a1, 60
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    andi a0, a0, 85
+; RV64I-NEXT:    sub a1, a1, a0
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    slli a2, a0, 32
+; RV64I-NEXT:    add a2, a0, a2
+; RV64I-NEXT:    slli a0, a2, 2
+; RV64I-NEXT:    xor a0, a0, a2
+; RV64I-NEXT:    and a3, a1, a0
+; RV64I-NEXT:    srli a1, a1, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    srli a1, a2, 3
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB8_2:
+; RV64I-NEXT:    li a0, 8
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_ctlz_i8:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    andi a1, a0, 255
+; RV64M-NEXT:    beqz a1, .LBB8_2
+; RV64M-NEXT:  # %bb.1: # %cond.false
+; RV64M-NEXT:    slli a1, a0, 56
+; RV64M-NEXT:    srli a1, a1, 57
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 56
+; RV64M-NEXT:    srli a1, a1, 58
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 56
+; RV64M-NEXT:    srli a1, a1, 60
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    andi a1, a0, 255
+; RV64M-NEXT:    srli a0, a0, 1
+; RV64M-NEXT:    andi a0, a0, 85
+; RV64M-NEXT:    sub a1, a1, a0
+; RV64M-NEXT:    lui a0, 61681
+; RV64M-NEXT:    addiw a0, a0, -241
+; RV64M-NEXT:    slli a2, a0, 32
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    slli a2, a0, 2
+; RV64M-NEXT:    xor a2, a2, a0
+; RV64M-NEXT:    and a3, a1, a2
+; RV64M-NEXT:    srli a1, a1, 2
+; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    add a1, a3, a1
+; RV64M-NEXT:    srli a2, a1, 4
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    and a1, a1, a0
+; RV64M-NEXT:    srli a2, a0, 3
+; RV64M-NEXT:    and a0, a2, a0
+; RV64M-NEXT:    mul a0, a1, a0
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
+; RV64M-NEXT:  .LBB8_2:
+; RV64M-NEXT:    li a0, 8
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_ctlz_i8:
 ; RV32ZBB:       # %bb.0:
@@ -1085,44 +1391,103 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    li a0, 16
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_ctlz_i16:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    beqz a1, .LBB9_2
-; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
-; RV64NOZBB-NEXT:    srli a1, a1, 49
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 50
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 52
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 56
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
-; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 15
-; RV64NOZBB-NEXT:    slli a0, a0, 52
-; RV64NOZBB-NEXT:    srli a0, a0, 60
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    ret
-; RV64NOZBB-NEXT:  .LBB9_2:
-; RV64NOZBB-NEXT:    li a0, 16
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_ctlz_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    beqz a1, .LBB9_2
+; RV64I-NEXT:  # %bb.1: # %cond.false
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    srli a1, a1, 49
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srli a1, a1, 50
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srli a1, a1, 52
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srli a1, a1, 56
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    lui a1, 16
+; RV64I-NEXT:    addiw a1, a1, -1
+; RV64I-NEXT:    and a1, a0, a1
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    sub a1, a1, a0
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    slli a2, a0, 32
+; RV64I-NEXT:    add a2, a0, a2
+; RV64I-NEXT:    slli a0, a2, 2
+; RV64I-NEXT:    xor a0, a0, a2
+; RV64I-NEXT:    and a3, a1, a0
+; RV64I-NEXT:    srli a1, a1, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    srli a1, a2, 3
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB9_2:
+; RV64I-NEXT:    li a0, 16
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_ctlz_i16:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    beqz a1, .LBB9_2
+; RV64M-NEXT:  # %bb.1: # %cond.false
+; RV64M-NEXT:    srli a1, a1, 49
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    srli a1, a1, 50
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    srli a1, a1, 52
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    srli a1, a1, 56
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    lui a1, 16
+; RV64M-NEXT:    addiw a1, a1, -1
+; RV64M-NEXT:    and a1, a0, a1
+; RV64M-NEXT:    srli a0, a0, 1
+; RV64M-NEXT:    lui a2, 5
+; RV64M-NEXT:    addiw a2, a2, 1365
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    sub a1, a1, a0
+; RV64M-NEXT:    lui a0, 61681
+; RV64M-NEXT:    addiw a0, a0, -241
+; RV64M-NEXT:    slli a2, a0, 32
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    slli a2, a0, 2
+; RV64M-NEXT:    xor a2, a2, a0
+; RV64M-NEXT:    and a3, a1, a2
+; RV64M-NEXT:    srli a1, a1, 2
+; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    add a1, a3, a1
+; RV64M-NEXT:    srli a2, a1, 4
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    and a1, a1, a0
+; RV64M-NEXT:    srli a2, a0, 3
+; RV64M-NEXT:    and a0, a2, a0
+; RV64M-NEXT:    mul a0, a1, a0
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
+; RV64M-NEXT:  .LBB9_2:
+; RV64M-NEXT:    li a0, 16
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_ctlz_i16:
 ; RV32ZBB:       # %bb.0:
@@ -1222,22 +1587,26 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -1305,22 +1674,26 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    addiw a2, a2, 1365
 ; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    slli a0, a0, 32
+; RV64M-NEXT:    srli a0, a0, 32
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
-; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    add a0, a0, a1
 ; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
 ; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addi a1, a1, 257
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    srliw a0, a0, 24
+; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
 ; RV64M-NEXT:  .LBB10_2:
 ; RV64M-NEXT:    li a0, 32
@@ -1466,25 +1839,21 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    add a2, a2, a3
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1583,25 +1952,21 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV64M-NEXT:    add a2, a2, a3
 ; RV64M-NEXT:    and a1, a1, a2
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a2, a0, a1
-; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    add a0, a0, a1
 ; RV64M-NEXT:    lui a1, 61681
 ; RV64M-NEXT:    addiw a1, a1, -241
 ; RV64M-NEXT:    slli a2, a1, 32
 ; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
 ; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addiw a1, a1, 257
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
@@ -1673,29 +2038,79 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
 ; RV32_NOZBB-NEXT:    andi a0, a0, 15
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_ctlz_i8_zero_undef:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    slli a1, a0, 56
-; RV64NOZBB-NEXT:    srli a1, a1, 57
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 56
-; RV64NOZBB-NEXT:    srli a1, a1, 58
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 56
-; RV64NOZBB-NEXT:    srli a1, a1, 60
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    andi a1, a1, 85
-; RV64NOZBB-NEXT:    subw a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 51
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    andi a0, a0, 51
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a0, a0, 15
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_ctlz_i8_zero_undef:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srli a1, a1, 57
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srli a1, a1, 58
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srli a1, a1, 60
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    andi a0, a0, 85
+; RV64I-NEXT:    sub a1, a1, a0
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    slli a2, a0, 32
+; RV64I-NEXT:    add a2, a0, a2
+; RV64I-NEXT:    slli a0, a2, 2
+; RV64I-NEXT:    xor a0, a0, a2
+; RV64I-NEXT:    and a3, a1, a0
+; RV64I-NEXT:    srli a1, a1, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    srli a1, a2, 3
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_ctlz_i8_zero_undef:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    slli a1, a0, 56
+; RV64M-NEXT:    srli a1, a1, 57
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 56
+; RV64M-NEXT:    srli a1, a1, 58
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 56
+; RV64M-NEXT:    srli a1, a1, 60
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    andi a1, a0, 255
+; RV64M-NEXT:    srli a0, a0, 1
+; RV64M-NEXT:    andi a0, a0, 85
+; RV64M-NEXT:    sub a1, a1, a0
+; RV64M-NEXT:    lui a0, 61681
+; RV64M-NEXT:    addiw a0, a0, -241
+; RV64M-NEXT:    slli a2, a0, 32
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    slli a2, a0, 2
+; RV64M-NEXT:    xor a2, a2, a0
+; RV64M-NEXT:    and a3, a1, a2
+; RV64M-NEXT:    srli a1, a1, 2
+; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    add a1, a3, a1
+; RV64M-NEXT:    srli a2, a1, 4
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    and a1, a1, a0
+; RV64M-NEXT:    srli a2, a0, 3
+; RV64M-NEXT:    and a0, a2, a0
+; RV64M-NEXT:    mul a0, a1, a0
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_ctlz_i8_zero_undef:
 ; RV32ZBB:       # %bb.0:
@@ -1763,39 +2178,93 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 49
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 50
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 52
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 56
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
-; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 15
-; RV64NOZBB-NEXT:    slli a0, a0, 52
-; RV64NOZBB-NEXT:    srli a0, a0, 60
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_ctlz_i16_zero_undef:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srli a1, a1, 49
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srli a1, a1, 50
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srli a1, a1, 52
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srli a1, a1, 56
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    lui a1, 16
+; RV64I-NEXT:    addiw a1, a1, -1
+; RV64I-NEXT:    and a1, a0, a1
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    sub a1, a1, a0
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    slli a2, a0, 32
+; RV64I-NEXT:    add a2, a0, a2
+; RV64I-NEXT:    slli a0, a2, 2
+; RV64I-NEXT:    xor a0, a0, a2
+; RV64I-NEXT:    and a3, a1, a0
+; RV64I-NEXT:    srli a1, a1, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    srli a1, a2, 3
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_ctlz_i16_zero_undef:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    srli a1, a1, 49
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    srli a1, a1, 50
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    srli a1, a1, 52
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    srli a1, a1, 56
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    lui a1, 16
+; RV64M-NEXT:    addiw a1, a1, -1
+; RV64M-NEXT:    and a1, a0, a1
+; RV64M-NEXT:    srli a0, a0, 1
+; RV64M-NEXT:    lui a2, 5
+; RV64M-NEXT:    addiw a2, a2, 1365
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    sub a1, a1, a0
+; RV64M-NEXT:    lui a0, 61681
+; RV64M-NEXT:    addiw a0, a0, -241
+; RV64M-NEXT:    slli a2, a0, 32
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    slli a2, a0, 2
+; RV64M-NEXT:    xor a2, a2, a0
+; RV64M-NEXT:    and a3, a1, a2
+; RV64M-NEXT:    srli a1, a1, 2
+; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    add a1, a3, a1
+; RV64M-NEXT:    srli a2, a1, 4
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    and a1, a1, a0
+; RV64M-NEXT:    srli a2, a0, 3
+; RV64M-NEXT:    and a0, a2, a0
+; RV64M-NEXT:    mul a0, a1, a0
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_ctlz_i16_zero_undef:
 ; RV32ZBB:       # %bb.0:
@@ -1887,22 +2356,26 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -1959,22 +2432,26 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    addiw a2, a2, 1365
 ; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    slli a0, a0, 32
+; RV64M-NEXT:    srli a0, a0, 32
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
-; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    add a0, a0, a1
 ; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
 ; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addi a1, a1, 257
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    srliw a0, a0, 24
+; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_ctlz_i32_zero_undef:
@@ -2115,25 +2592,21 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV64I-NEXT:    add a2, a2, a3
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -2227,25 +2700,21 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV64M-NEXT:    add a2, a2, a3
 ; RV64M-NEXT:    and a1, a1, a2
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a2, a0, a1
-; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    add a0, a0, a1
 ; RV64M-NEXT:    lui a1, 61681
 ; RV64M-NEXT:    addiw a1, a1, -241
 ; RV64M-NEXT:    slli a2, a1, 32
 ; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
 ; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addiw a1, a1, 257
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
@@ -2304,19 +2773,59 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
 ; RV32_NOZBB-NEXT:    andi a0, a0, 15
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_ctpop_i8:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    andi a1, a1, 85
-; RV64NOZBB-NEXT:    subw a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 51
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    andi a0, a0, 51
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a0, a0, 15
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_ctpop_i8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    andi a3, a0, 255
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    andi a0, a0, 85
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    srli a0, a3, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_ctpop_i8:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    andi a3, a0, 255
+; RV64M-NEXT:    srli a0, a0, 1
+; RV64M-NEXT:    andi a0, a0, 85
+; RV64M-NEXT:    sub a3, a3, a0
+; RV64M-NEXT:    srli a0, a3, 2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    and a2, a3, a2
+; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_ctpop_i8:
 ; RV32ZBB:       # %bb.0:
@@ -2346,16 +2855,31 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
 ;
 ; RV64XTHEADBB-LABEL: test_ctpop_i8:
 ; RV64XTHEADBB:       # %bb.0:
-; RV64XTHEADBB-NEXT:    srli a1, a0, 1
-; RV64XTHEADBB-NEXT:    andi a1, a1, 85
-; RV64XTHEADBB-NEXT:    subw a0, a0, a1
-; RV64XTHEADBB-NEXT:    andi a1, a0, 51
-; RV64XTHEADBB-NEXT:    srli a0, a0, 2
-; RV64XTHEADBB-NEXT:    andi a0, a0, 51
-; RV64XTHEADBB-NEXT:    add a0, a1, a0
-; RV64XTHEADBB-NEXT:    srli a1, a0, 4
-; RV64XTHEADBB-NEXT:    add a0, a0, a1
-; RV64XTHEADBB-NEXT:    andi a0, a0, 15
+; RV64XTHEADBB-NEXT:    addi sp, sp, -16
+; RV64XTHEADBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64XTHEADBB-NEXT:    lui a1, 61681
+; RV64XTHEADBB-NEXT:    addiw a1, a1, -241
+; RV64XTHEADBB-NEXT:    slli a2, a1, 32
+; RV64XTHEADBB-NEXT:    add a1, a1, a2
+; RV64XTHEADBB-NEXT:    slli a2, a1, 2
+; RV64XTHEADBB-NEXT:    xor a2, a2, a1
+; RV64XTHEADBB-NEXT:    andi a3, a0, 255
+; RV64XTHEADBB-NEXT:    srli a0, a0, 1
+; RV64XTHEADBB-NEXT:    andi a0, a0, 85
+; RV64XTHEADBB-NEXT:    sub a3, a3, a0
+; RV64XTHEADBB-NEXT:    srli a0, a3, 2
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    and a2, a3, a2
+; RV64XTHEADBB-NEXT:    add a0, a2, a0
+; RV64XTHEADBB-NEXT:    srli a2, a0, 4
+; RV64XTHEADBB-NEXT:    add a0, a0, a2
+; RV64XTHEADBB-NEXT:    and a0, a0, a1
+; RV64XTHEADBB-NEXT:    srli a2, a1, 3
+; RV64XTHEADBB-NEXT:    and a1, a2, a1
+; RV64XTHEADBB-NEXT:    call __muldi3
+; RV64XTHEADBB-NEXT:    srli a0, a0, 56
+; RV64XTHEADBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64XTHEADBB-NEXT:    addi sp, sp, 16
 ; RV64XTHEADBB-NEXT:    ret
   %1 = call i8 @llvm.ctpop.i8(i8 %a)
   ret i8 %1
@@ -2383,26 +2907,65 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_ctpop_i16:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
-; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 15
-; RV64NOZBB-NEXT:    slli a0, a0, 52
-; RV64NOZBB-NEXT:    srli a0, a0, 60
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_ctpop_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 5
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli a0, a0, 48
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    and a3, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a3
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_ctpop_i16:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    srli a3, a0, 1
+; RV64M-NEXT:    lui a4, 5
+; RV64M-NEXT:    addiw a4, a4, 1365
+; RV64M-NEXT:    and a3, a3, a4
+; RV64M-NEXT:    slli a0, a0, 48
+; RV64M-NEXT:    srli a0, a0, 48
+; RV64M-NEXT:    sub a0, a0, a3
+; RV64M-NEXT:    srli a3, a0, 2
+; RV64M-NEXT:    and a3, a3, a2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a0, a3
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_ctpop_i16:
 ; RV32ZBB:       # %bb.0:
@@ -2439,23 +3002,33 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ;
 ; RV64XTHEADBB-LABEL: test_ctpop_i16:
 ; RV64XTHEADBB:       # %bb.0:
-; RV64XTHEADBB-NEXT:    srli a1, a0, 1
-; RV64XTHEADBB-NEXT:    lui a2, 5
-; RV64XTHEADBB-NEXT:    addiw a2, a2, 1365
-; RV64XTHEADBB-NEXT:    and a1, a1, a2
-; RV64XTHEADBB-NEXT:    sub a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 3
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 819
-; RV64XTHEADBB-NEXT:    and a2, a0, a1
-; RV64XTHEADBB-NEXT:    srli a0, a0, 2
+; RV64XTHEADBB-NEXT:    addi sp, sp, -16
+; RV64XTHEADBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64XTHEADBB-NEXT:    lui a1, 61681
+; RV64XTHEADBB-NEXT:    addiw a1, a1, -241
+; RV64XTHEADBB-NEXT:    slli a2, a1, 32
+; RV64XTHEADBB-NEXT:    add a1, a1, a2
+; RV64XTHEADBB-NEXT:    slli a2, a1, 2
+; RV64XTHEADBB-NEXT:    xor a2, a2, a1
+; RV64XTHEADBB-NEXT:    srli a3, a0, 1
+; RV64XTHEADBB-NEXT:    lui a4, 5
+; RV64XTHEADBB-NEXT:    addiw a4, a4, 1365
+; RV64XTHEADBB-NEXT:    and a3, a3, a4
+; RV64XTHEADBB-NEXT:    th.extu a0, a0, 15, 0
+; RV64XTHEADBB-NEXT:    sub a0, a0, a3
+; RV64XTHEADBB-NEXT:    srli a3, a0, 2
+; RV64XTHEADBB-NEXT:    and a3, a3, a2
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    add a0, a0, a3
+; RV64XTHEADBB-NEXT:    srli a2, a0, 4
+; RV64XTHEADBB-NEXT:    add a0, a0, a2
 ; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    add a0, a2, a0
-; RV64XTHEADBB-NEXT:    srli a1, a0, 4
-; RV64XTHEADBB-NEXT:    add a0, a0, a1
-; RV64XTHEADBB-NEXT:    andi a1, a0, 15
-; RV64XTHEADBB-NEXT:    slli a0, a0, 52
-; RV64XTHEADBB-NEXT:    srli a0, a0, 60
-; RV64XTHEADBB-NEXT:    add a0, a1, a0
+; RV64XTHEADBB-NEXT:    srli a2, a1, 3
+; RV64XTHEADBB-NEXT:    and a1, a2, a1
+; RV64XTHEADBB-NEXT:    call __muldi3
+; RV64XTHEADBB-NEXT:    srli a0, a0, 56
+; RV64XTHEADBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64XTHEADBB-NEXT:    addi sp, sp, 16
 ; RV64XTHEADBB-NEXT:    ret
   %1 = call i16 @llvm.ctpop.i16(i16 %a)
   ret i16 %1
@@ -2494,26 +3067,30 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    and a3, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a3
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -2544,26 +3121,30 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ;
 ; RV64M-LABEL: test_ctpop_i32:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    and a1, a1, a2
-; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
-; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    add a0, a0, a1
 ; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    srli a3, a0, 1
+; RV64M-NEXT:    lui a4, 349525
+; RV64M-NEXT:    addiw a4, a4, 1365
+; RV64M-NEXT:    and a3, a3, a4
+; RV64M-NEXT:    slli a0, a0, 32
+; RV64M-NEXT:    srli a0, a0, 32
+; RV64M-NEXT:    sub a0, a0, a3
+; RV64M-NEXT:    srli a3, a0, 2
+; RV64M-NEXT:    and a3, a3, a2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a0, a3
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
 ; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addi a1, a1, 257
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    srliw a0, a0, 24
+; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_ctpop_i32:
@@ -2608,26 +3189,29 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    addi sp, sp, -16
 ; RV64XTHEADBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64XTHEADBB-NEXT:    srli a1, a0, 1
-; RV64XTHEADBB-NEXT:    lui a2, 349525
-; RV64XTHEADBB-NEXT:    addiw a2, a2, 1365
-; RV64XTHEADBB-NEXT:    and a1, a1, a2
-; RV64XTHEADBB-NEXT:    sub a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 209715
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 819
-; RV64XTHEADBB-NEXT:    and a2, a0, a1
-; RV64XTHEADBB-NEXT:    srli a0, a0, 2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    add a0, a2, a0
-; RV64XTHEADBB-NEXT:    srli a1, a0, 4
-; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    lui a1, 61681
 ; RV64XTHEADBB-NEXT:    addiw a1, a1, -241
+; RV64XTHEADBB-NEXT:    slli a2, a1, 32
+; RV64XTHEADBB-NEXT:    add a1, a1, a2
+; RV64XTHEADBB-NEXT:    slli a2, a1, 2
+; RV64XTHEADBB-NEXT:    xor a2, a2, a1
+; RV64XTHEADBB-NEXT:    srli a3, a0, 1
+; RV64XTHEADBB-NEXT:    lui a4, 349525
+; RV64XTHEADBB-NEXT:    addiw a4, a4, 1365
+; RV64XTHEADBB-NEXT:    and a3, a3, a4
+; RV64XTHEADBB-NEXT:    th.extu a0, a0, 31, 0
+; RV64XTHEADBB-NEXT:    sub a0, a0, a3
+; RV64XTHEADBB-NEXT:    srli a3, a0, 2
+; RV64XTHEADBB-NEXT:    and a3, a3, a2
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    add a0, a0, a3
+; RV64XTHEADBB-NEXT:    srli a2, a0, 4
+; RV64XTHEADBB-NEXT:    add a0, a0, a2
 ; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 4112
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 257
+; RV64XTHEADBB-NEXT:    srli a2, a1, 3
+; RV64XTHEADBB-NEXT:    and a1, a2, a1
 ; RV64XTHEADBB-NEXT:    call __muldi3
-; RV64XTHEADBB-NEXT:    srliw a0, a0, 24
+; RV64XTHEADBB-NEXT:    srli a0, a0, 56
 ; RV64XTHEADBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64XTHEADBB-NEXT:    addi sp, sp, 16
 ; RV64XTHEADBB-NEXT:    ret
@@ -2697,32 +3281,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -2769,32 +3349,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ;
 ; RV64M-LABEL: test_ctpop_i64:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    slli a3, a2, 32
-; RV64M-NEXT:    add a2, a2, a3
-; RV64M-NEXT:    and a1, a1, a2
-; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a2, a0, a1
-; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    add a0, a0, a1
 ; RV64M-NEXT:    lui a1, 61681
 ; RV64M-NEXT:    addiw a1, a1, -241
 ; RV64M-NEXT:    slli a2, a1, 32
 ; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    srli a3, a0, 1
+; RV64M-NEXT:    lui a4, 349525
+; RV64M-NEXT:    addiw a4, a4, 1365
+; RV64M-NEXT:    slli a5, a4, 32
+; RV64M-NEXT:    add a4, a4, a5
+; RV64M-NEXT:    and a3, a3, a4
+; RV64M-NEXT:    sub a0, a0, a3
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
 ; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addiw a1, a1, 257
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
@@ -2873,32 +3449,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    addi sp, sp, -16
 ; RV64XTHEADBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64XTHEADBB-NEXT:    srli a1, a0, 1
-; RV64XTHEADBB-NEXT:    lui a2, 349525
-; RV64XTHEADBB-NEXT:    addiw a2, a2, 1365
-; RV64XTHEADBB-NEXT:    slli a3, a2, 32
-; RV64XTHEADBB-NEXT:    add a2, a2, a3
-; RV64XTHEADBB-NEXT:    and a1, a1, a2
-; RV64XTHEADBB-NEXT:    sub a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 209715
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 819
-; RV64XTHEADBB-NEXT:    slli a2, a1, 32
-; RV64XTHEADBB-NEXT:    add a1, a1, a2
-; RV64XTHEADBB-NEXT:    and a2, a0, a1
-; RV64XTHEADBB-NEXT:    srli a0, a0, 2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    add a0, a2, a0
-; RV64XTHEADBB-NEXT:    srli a1, a0, 4
-; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    lui a1, 61681
 ; RV64XTHEADBB-NEXT:    addiw a1, a1, -241
 ; RV64XTHEADBB-NEXT:    slli a2, a1, 32
 ; RV64XTHEADBB-NEXT:    add a1, a1, a2
+; RV64XTHEADBB-NEXT:    slli a2, a1, 2
+; RV64XTHEADBB-NEXT:    xor a2, a2, a1
+; RV64XTHEADBB-NEXT:    srli a3, a0, 1
+; RV64XTHEADBB-NEXT:    lui a4, 349525
+; RV64XTHEADBB-NEXT:    addiw a4, a4, 1365
+; RV64XTHEADBB-NEXT:    slli a5, a4, 32
+; RV64XTHEADBB-NEXT:    add a4, a4, a5
+; RV64XTHEADBB-NEXT:    and a3, a3, a4
+; RV64XTHEADBB-NEXT:    sub a0, a0, a3
+; RV64XTHEADBB-NEXT:    and a3, a0, a2
+; RV64XTHEADBB-NEXT:    srli a0, a0, 2
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    add a0, a3, a0
+; RV64XTHEADBB-NEXT:    srli a2, a0, 4
+; RV64XTHEADBB-NEXT:    add a0, a0, a2
 ; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 4112
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 257
-; RV64XTHEADBB-NEXT:    slli a2, a1, 32
-; RV64XTHEADBB-NEXT:    add a1, a1, a2
+; RV64XTHEADBB-NEXT:    srli a2, a1, 3
+; RV64XTHEADBB-NEXT:    and a1, a2, a1
 ; RV64XTHEADBB-NEXT:    call __muldi3
 ; RV64XTHEADBB-NEXT:    srli a0, a0, 56
 ; RV64XTHEADBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index adf614435b31d7..007bbf79d23639 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -85,24 +85,36 @@ define signext i32 @ctz_dereferencing_pointer(ptr %b) nounwind {
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    ld s0, 0(a0)
-; RV64I-NEXT:    neg a0, s0
-; RV64I-NEXT:    and a0, s0, a0
-; RV64I-NEXT:    lui a1, %hi(.LCPI0_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI0_0)(a1)
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    srli a0, a0, 58
-; RV64I-NEXT:    lui a1, %hi(.LCPI0_1)
-; RV64I-NEXT:    addi a1, a1, %lo(.LCPI0_1)
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    seqz a1, s0
-; RV64I-NEXT:    addi a1, a1, -1
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    andi a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 
@@ -548,24 +560,35 @@ define signext i32 @ctz4(i64 %b) nounwind {
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    neg a0, a0
-; RV64I-NEXT:    and a0, s0, a0
-; RV64I-NEXT:    lui a1, %hi(.LCPI6_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI6_0)(a1)
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    srli a0, a0, 58
-; RV64I-NEXT:    lui a1, %hi(.LCPI6_1)
-; RV64I-NEXT:    addi a1, a1, %lo(.LCPI6_1)
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    seqz a1, s0
-; RV64I-NEXT:    addi a1, a1, -1
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    andi a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 
@@ -712,25 +735,21 @@ define signext i32 @ctlz(i64 %b) nounwind {
 ; RV64I-NEXT:    add a2, a2, a3
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    srli a0, a0, 58
diff --git a/llvm/test/CodeGen/RISCV/pr56457.ll b/llvm/test/CodeGen/RISCV/pr56457.ll
index ba08aa838bf992..bb7239862de1e0 100644
--- a/llvm/test/CodeGen/RISCV/pr56457.ll
+++ b/llvm/test/CodeGen/RISCV/pr56457.ll
@@ -28,25 +28,21 @@ define i15 @foo(i15 %x) nounwind {
 ; CHECK-NEXT:    slli a0, a0, 49
 ; CHECK-NEXT:    srli a0, a0, 49
 ; CHECK-NEXT:    sub a0, a0, a1
-; CHECK-NEXT:    lui a1, 209715
-; CHECK-NEXT:    addiw a1, a1, 819
-; CHECK-NEXT:    slli a2, a1, 32
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    and a2, a0, a1
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    add a0, a2, a0
-; CHECK-NEXT:    srli a1, a0, 4
-; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    lui a1, 61681
 ; CHECK-NEXT:    addiw a1, a1, -241
 ; CHECK-NEXT:    slli a2, a1, 32
 ; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a2, a1, 2
+; CHECK-NEXT:    xor a2, a2, a1
+; CHECK-NEXT:    and a3, a0, a2
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    add a0, a3, a0
+; CHECK-NEXT:    srli a2, a0, 4
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    lui a1, 4112
-; CHECK-NEXT:    addiw a1, a1, 257
-; CHECK-NEXT:    slli a2, a1, 32
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    srli a2, a1, 3
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    srli a0, a0, 56
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
index 73bfc6480b4d75..929cf7d35d674b 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
@@ -339,25 +339,21 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    add a2, a2, a3
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -591,16 +587,33 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, %hi(.LCPI10_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI10_0)(a1)
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srli a0, a0, 58
-; RV64I-NEXT:    lui a1, %hi(.LCPI10_1)
-; RV64I-NEXT:    addi a1, a1, %lo(.LCPI10_1)
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll
index 7feef4dad4116a..268488b14d8069 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll
@@ -329,25 +329,21 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    add a2, a2, a3
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -514,16 +510,33 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, %hi(.LCPI10_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI10_0)(a1)
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srli a0, a0, 58
-; RV64I-NEXT:    lui a1, %hi(.LCPI10_1)
-; RV64I-NEXT:    addi a1, a1, %lo(.LCPI10_1)
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -625,32 +638,28 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 1f62ea9f568191..7684a904ae9ce0 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -28,22 +28,26 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -83,22 +87,26 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    j .LBB1_3
@@ -148,22 +156,26 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a1, a0, 24
+; RV64I-NEXT:    srli a1, a0, 56
 ; RV64I-NEXT:  .LBB2_2: # %cond.end
 ; RV64I-NEXT:    sub a0, s0, a1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -208,22 +220,26 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    xori a0, a0, 31
 ; RV64I-NEXT:    snez a1, s0
 ; RV64I-NEXT:    addi a1, a1, -1
@@ -275,22 +291,26 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -339,25 +359,21 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    add a2, a2, a3
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -548,16 +564,33 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, %hi(.LCPI10_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI10_0)(a1)
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srli a0, a0, 58
-; RV64I-NEXT:    lui a1, %hi(.LCPI10_1)
-; RV64I-NEXT:    addi a1, a1, %lo(.LCPI10_1)
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 2269d8d04c9cb0..b8d4d9706dc0ad 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -28,22 +28,26 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -81,22 +85,26 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    j .LBB1_3
@@ -144,22 +152,26 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a1, a0, 24
+; RV64I-NEXT:    srli a1, a0, 56
 ; RV64I-NEXT:  .LBB2_2: # %cond.end
 ; RV64I-NEXT:    sub a0, s0, a1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -202,22 +214,26 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    xori a0, a0, 31
 ; RV64I-NEXT:    snez a1, s0
 ; RV64I-NEXT:    addi a1, a1, -1
@@ -267,22 +283,26 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -329,25 +349,21 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    add a2, a2, a3
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -514,16 +530,33 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, %hi(.LCPI10_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI10_0)(a1)
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srli a0, a0, 58
-; RV64I-NEXT:    lui a1, %hi(.LCPI10_1)
-; RV64I-NEXT:    addi a1, a1, %lo(.LCPI10_1)
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -546,26 +579,30 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    slli a1, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    slli a3, a2, 32
+; RV64I-NEXT:    add a2, a2, a3
+; RV64I-NEXT:    slli a3, a2, 2
+; RV64I-NEXT:    xor a3, a3, a2
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    and a0, a0, a4
+; RV64I-NEXT:    sub a1, a1, a0
+; RV64I-NEXT:    srli a0, a1, 2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    srli a1, a2, 3
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -659,27 +696,29 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 0(a0)
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lwu a0, 0(a0)
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    and a3, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a3
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -707,41 +746,48 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s5, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 0(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
-; RV64I-NEXT:    srli a1, a0, 1
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli s3, a1, 32
+; RV64I-NEXT:    slli a1, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    slli a3, a2, 32
+; RV64I-NEXT:    add s4, a2, a3
+; RV64I-NEXT:    slli a2, s4, 2
+; RV64I-NEXT:    xor s5, a2, s4
+; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw s3, a2, 1365
-; RV64I-NEXT:    and a1, a1, s3
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw s4, a1, 819
-; RV64I-NEXT:    and a1, a0, s4
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, s4
+; RV64I-NEXT:    addiw s6, a2, 1365
+; RV64I-NEXT:    and a0, a0, s6
+; RV64I-NEXT:    sub a1, a1, a0
+; RV64I-NEXT:    srli a0, a1, 2
+; RV64I-NEXT:    and a0, a0, s5
+; RV64I-NEXT:    and a1, a1, s5
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw s5, a1, -241
-; RV64I-NEXT:    and a0, a0, s5
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw s1, a1, 257
+; RV64I-NEXT:    and a0, a0, s4
+; RV64I-NEXT:    srli a1, s4, 3
+; RV64I-NEXT:    and s1, a1, s4
 ; RV64I-NEXT:    mv a1, s1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw s2, a0, 24
-; RV64I-NEXT:    srli a0, s0, 1
-; RV64I-NEXT:    and a0, a0, s3
-; RV64I-NEXT:    sub s0, s0, a0
-; RV64I-NEXT:    and a0, s0, s4
-; RV64I-NEXT:    srli s0, s0, 2
-; RV64I-NEXT:    and a1, s0, s4
+; RV64I-NEXT:    srli s2, a0, 56
+; RV64I-NEXT:    srli s0, s0, 1
+; RV64I-NEXT:    and a0, s0, s6
+; RV64I-NEXT:    sub a0, s3, a0
+; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    and a1, a1, s5
+; RV64I-NEXT:    and a0, a0, s5
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    and a0, a0, s5
+; RV64I-NEXT:    and a0, a0, s4
 ; RV64I-NEXT:    mv a1, s1
 ; RV64I-NEXT:    call __muldi3
-; RV64I-NEXT:    srliw a1, a0, 24
+; RV64I-NEXT:    srli a1, a0, 56
 ; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -750,6 +796,7 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
 ; RV64I-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s4, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s5, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
@@ -877,32 +924,28 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 349525
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1007,37 +1050,33 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV64I-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s5, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add s3, a1, a2
+; RV64I-NEXT:    slli a1, s3, 2
+; RV64I-NEXT:    xor s4, a1, s3
 ; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add s3, a2, a3
-; RV64I-NEXT:    and a1, a1, s3
+; RV64I-NEXT:    add s5, a2, a3
+; RV64I-NEXT:    and a1, a1, s5
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add s4, a1, a2
 ; RV64I-NEXT:    and a1, a0, s4
 ; RV64I-NEXT:    srli a0, a0, 2
 ; RV64I-NEXT:    and a0, a0, s4
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add s5, a1, a2
-; RV64I-NEXT:    and a0, a0, s5
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw s1, a1, 257
-; RV64I-NEXT:    slli a1, s1, 32
-; RV64I-NEXT:    add s1, s1, a1
+; RV64I-NEXT:    and a0, a0, s3
+; RV64I-NEXT:    srli a1, s3, 3
+; RV64I-NEXT:    and s1, a1, s3
 ; RV64I-NEXT:    mv a1, s1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli s2, a0, 56
 ; RV64I-NEXT:    srli a0, s0, 1
-; RV64I-NEXT:    and a0, a0, s3
+; RV64I-NEXT:    and a0, a0, s5
 ; RV64I-NEXT:    sub s0, s0, a0
 ; RV64I-NEXT:    and a0, s0, s4
 ; RV64I-NEXT:    srli s0, s0, 2
@@ -1045,7 +1084,7 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    and a0, a0, s5
+; RV64I-NEXT:    and a0, a0, s3
 ; RV64I-NEXT:    mv a1, s1
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srli a1, a0, 56
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index fc94f8c2a52797..4b1bfa48a95d82 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -1147,35 +1147,26 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
+; RV32I-NEXT:    vsll.vi v11, v10, 2
+; RV32I-NEXT:    vxor.vv v11, v10, v11
+; RV32I-NEXT:    vadd.vv v12, v11, v11
+; RV32I-NEXT:    vxor.vv v12, v11, v12
+; RV32I-NEXT:    vand.vv v9, v9, v12
 ; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v11
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vand.vv v8, v8, v11
+; RV32I-NEXT:    vadd.vv v8, v9, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v9, v10, 3
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1199,32 +1190,27 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vmv.v.x v10, a0
+; RV64I-NEXT:    vsll.vi v11, v10, 2
+; RV64I-NEXT:    vxor.vx v11, v11, a0
+; RV64I-NEXT:    vadd.vv v12, v11, v11
+; RV64I-NEXT:    vxor.vv v12, v11, v12
+; RV64I-NEXT:    vand.vv v9, v9, v12
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vv v9, v8, v11
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vv v8, v8, v11
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v9, v10, 3
+; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v9
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1288,35 +1274,26 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
+; RV32I-NEXT:    vsll.vi v14, v12, 2
+; RV32I-NEXT:    vxor.vv v14, v12, v14
+; RV32I-NEXT:    vadd.vv v16, v14, v14
+; RV32I-NEXT:    vxor.vv v16, v14, v16
+; RV32I-NEXT:    vand.vv v10, v10, v16
 ; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v14
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vand.vv v8, v8, v14
+; RV32I-NEXT:    vadd.vv v8, v10, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v10, v12, 3
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1340,32 +1317,27 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vmv.v.x v12, a0
+; RV64I-NEXT:    vsll.vi v14, v12, 2
+; RV64I-NEXT:    vxor.vx v14, v14, a0
+; RV64I-NEXT:    vadd.vv v16, v14, v14
+; RV64I-NEXT:    vxor.vv v16, v14, v16
+; RV64I-NEXT:    vand.vv v10, v10, v16
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vv v10, v8, v14
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vv v8, v8, v14
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v10, v12, 3
+; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v10
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1429,35 +1401,26 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v12, v12, v16
+; RV32I-NEXT:    vsll.vi v20, v16, 2
+; RV32I-NEXT:    vxor.vv v20, v16, v20
+; RV32I-NEXT:    vadd.vv v24, v20, v20
+; RV32I-NEXT:    vxor.vv v24, v20, v24
+; RV32I-NEXT:    vand.vv v12, v12, v24
 ; RV32I-NEXT:    vsub.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v20
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vand.vv v8, v8, v20
+; RV32I-NEXT:    vadd.vv v8, v12, v8
 ; RV32I-NEXT:    vsrl.vi v12, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v12, v16, 3
+; RV32I-NEXT:    vand.vv v12, v16, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1481,32 +1444,27 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vmv.v.x v16, a0
+; RV64I-NEXT:    vsll.vi v20, v16, 2
+; RV64I-NEXT:    vxor.vx v20, v20, a0
+; RV64I-NEXT:    vadd.vv v24, v20, v20
+; RV64I-NEXT:    vxor.vv v24, v20, v24
+; RV64I-NEXT:    vand.vv v12, v12, v24
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vv v12, v8, v20
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vv v8, v8, v20
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v12, v16, 3
+; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v12
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1554,6 +1512,12 @@ declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
 define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-LABEL: ctlz_nxv8i64:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    sub sp, sp, a0
+; RV32I-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
 ; RV32I-NEXT:    vor.vv v8, v8, v16
@@ -1569,43 +1533,58 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-NEXT:    vsrl.vx v16, v8, a0
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v24, v8, v16
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
+; RV32I-NEXT:    addi a0, sp, 16
+; RV32I-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT:    vsrl.vi v8, v8, 1
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 3
+; RV32I-NEXT:    add a0, sp, a0
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v8, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vsll.vi v0, v8, 2
+; RV32I-NEXT:    vxor.vv v0, v8, v0
+; RV32I-NEXT:    vadd.vv v24, v0, v0
+; RV32I-NEXT:    vxor.vv v24, v0, v24
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 3
+; RV32I-NEXT:    add a0, sp, a0
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT:    vand.vv v24, v16, v24
+; RV32I-NEXT:    addi a0, sp, 16
+; RV32I-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT:    vsub.vv v16, v16, v24
+; RV32I-NEXT:    vand.vv v24, v16, v0
+; RV32I-NEXT:    vsrl.vi v16, v16, 2
+; RV32I-NEXT:    vand.vv v16, v16, v0
+; RV32I-NEXT:    vadd.vv v16, v24, v16
+; RV32I-NEXT:    vsrl.vi v24, v16, 4
+; RV32I-NEXT:    vadd.vv v16, v16, v24
+; RV32I-NEXT:    vand.vv v16, v16, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 3
+; RV32I-NEXT:    vand.vv v8, v8, v24
+; RV32I-NEXT:    vmul.vv v8, v16, v8
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    add sp, sp, a0
+; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: ctlz_nxv8i64:
 ; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    .cfi_def_cfa_offset 16
+; RV64I-NEXT:    csrr a0, vlenb
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    sub sp, sp, a0
+; RV64I-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v16
@@ -1620,36 +1599,49 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-NEXT:    li a0, 32
 ; RV64I-NEXT:    vsrl.vx v16, v8, a0
 ; RV64I-NEXT:    vor.vv v8, v8, v16
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v16, a0
-; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    vadd.vv v8, v16, v8
-; RV64I-NEXT:    vsrl.vi v16, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    vnot.v v16, v8
 ; RV64I-NEXT:    lui a0, 61681
 ; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    vmv.v.x v8, a0
+; RV64I-NEXT:    addi a1, sp, 16
+; RV64I-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT:    vsll.vi v24, v8, 2
+; RV64I-NEXT:    vxor.vx v24, v24, a0
+; RV64I-NEXT:    vadd.vv v0, v24, v24
+; RV64I-NEXT:    vxor.vv v0, v24, v0
+; RV64I-NEXT:    csrr a1, vlenb
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a1, sp, a1
+; RV64I-NEXT:    addi a1, a1, 16
+; RV64I-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT:    vsrl.vi v0, v16, 1
+; RV64I-NEXT:    csrr a1, vlenb
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a1, sp, a1
+; RV64I-NEXT:    addi a1, a1, 16
+; RV64I-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT:    vand.vv v0, v0, v8
+; RV64I-NEXT:    vsub.vv v16, v16, v0
+; RV64I-NEXT:    vand.vv v0, v16, v24
+; RV64I-NEXT:    vsrl.vi v16, v16, 2
+; RV64I-NEXT:    vand.vv v16, v16, v24
+; RV64I-NEXT:    vadd.vv v16, v0, v16
+; RV64I-NEXT:    vsrl.vi v24, v16, 4
+; RV64I-NEXT:    vadd.vv v16, v16, v24
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    addi a1, sp, 16
+; RV64I-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT:    vsrl.vi v8, v8, 3
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vv v8, v16, v8
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    csrr a0, vlenb
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    add sp, sp, a0
+; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; CHECK-F-LABEL: ctlz_nxv8i64:
@@ -2753,35 +2745,26 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
+; RV32I-NEXT:    vsll.vi v11, v10, 2
+; RV32I-NEXT:    vxor.vv v11, v10, v11
+; RV32I-NEXT:    vadd.vv v12, v11, v11
+; RV32I-NEXT:    vxor.vv v12, v11, v12
+; RV32I-NEXT:    vand.vv v9, v9, v12
 ; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v11
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vand.vv v8, v8, v11
+; RV32I-NEXT:    vadd.vv v8, v9, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v9, v10, 3
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2805,32 +2788,27 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vmv.v.x v10, a0
+; RV64I-NEXT:    vsll.vi v11, v10, 2
+; RV64I-NEXT:    vxor.vx v11, v11, a0
+; RV64I-NEXT:    vadd.vv v12, v11, v11
+; RV64I-NEXT:    vxor.vv v12, v11, v12
+; RV64I-NEXT:    vand.vv v9, v9, v12
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vv v9, v8, v11
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vv v8, v8, v11
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v9, v10, 3
+; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v9
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2889,35 +2867,26 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
+; RV32I-NEXT:    vsll.vi v14, v12, 2
+; RV32I-NEXT:    vxor.vv v14, v12, v14
+; RV32I-NEXT:    vadd.vv v16, v14, v14
+; RV32I-NEXT:    vxor.vv v16, v14, v16
+; RV32I-NEXT:    vand.vv v10, v10, v16
 ; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v14
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vand.vv v8, v8, v14
+; RV32I-NEXT:    vadd.vv v8, v10, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v10, v12, 3
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2941,32 +2910,27 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vmv.v.x v12, a0
+; RV64I-NEXT:    vsll.vi v14, v12, 2
+; RV64I-NEXT:    vxor.vx v14, v14, a0
+; RV64I-NEXT:    vadd.vv v16, v14, v14
+; RV64I-NEXT:    vxor.vv v16, v14, v16
+; RV64I-NEXT:    vand.vv v10, v10, v16
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vv v10, v8, v14
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vv v8, v8, v14
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v10, v12, 3
+; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v10
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3025,35 +2989,26 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v12, v12, v16
+; RV32I-NEXT:    vsll.vi v20, v16, 2
+; RV32I-NEXT:    vxor.vv v20, v16, v20
+; RV32I-NEXT:    vadd.vv v24, v20, v20
+; RV32I-NEXT:    vxor.vv v24, v20, v24
+; RV32I-NEXT:    vand.vv v12, v12, v24
 ; RV32I-NEXT:    vsub.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v20
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vand.vv v8, v8, v20
+; RV32I-NEXT:    vadd.vv v8, v12, v8
 ; RV32I-NEXT:    vsrl.vi v12, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v12, v16, 3
+; RV32I-NEXT:    vand.vv v12, v16, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -3077,32 +3032,27 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vmv.v.x v16, a0
+; RV64I-NEXT:    vsll.vi v20, v16, 2
+; RV64I-NEXT:    vxor.vx v20, v20, a0
+; RV64I-NEXT:    vadd.vv v24, v20, v20
+; RV64I-NEXT:    vxor.vv v24, v20, v24
+; RV64I-NEXT:    vand.vv v12, v12, v24
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vv v12, v8, v20
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vv v8, v8, v20
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v12, v16, 3
+; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v12
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3145,6 +3095,12 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-LABEL: ctlz_zero_undef_nxv8i64:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    sub sp, sp, a0
+; RV32I-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
 ; RV32I-NEXT:    vor.vv v8, v8, v16
@@ -3160,43 +3116,58 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-NEXT:    vsrl.vx v16, v8, a0
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v24, v8, v16
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
+; RV32I-NEXT:    addi a0, sp, 16
+; RV32I-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT:    vsrl.vi v8, v8, 1
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 3
+; RV32I-NEXT:    add a0, sp, a0
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v8, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vsll.vi v0, v8, 2
+; RV32I-NEXT:    vxor.vv v0, v8, v0
+; RV32I-NEXT:    vadd.vv v24, v0, v0
+; RV32I-NEXT:    vxor.vv v24, v0, v24
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 3
+; RV32I-NEXT:    add a0, sp, a0
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT:    vand.vv v24, v16, v24
+; RV32I-NEXT:    addi a0, sp, 16
+; RV32I-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT:    vsub.vv v16, v16, v24
+; RV32I-NEXT:    vand.vv v24, v16, v0
+; RV32I-NEXT:    vsrl.vi v16, v16, 2
+; RV32I-NEXT:    vand.vv v16, v16, v0
+; RV32I-NEXT:    vadd.vv v16, v24, v16
+; RV32I-NEXT:    vsrl.vi v24, v16, 4
+; RV32I-NEXT:    vadd.vv v16, v16, v24
+; RV32I-NEXT:    vand.vv v16, v16, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 3
+; RV32I-NEXT:    vand.vv v8, v8, v24
+; RV32I-NEXT:    vmul.vv v8, v16, v8
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    add sp, sp, a0
+; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: ctlz_zero_undef_nxv8i64:
 ; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    .cfi_def_cfa_offset 16
+; RV64I-NEXT:    csrr a0, vlenb
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    sub sp, sp, a0
+; RV64I-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v16
@@ -3211,36 +3182,49 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-NEXT:    li a0, 32
 ; RV64I-NEXT:    vsrl.vx v16, v8, a0
 ; RV64I-NEXT:    vor.vv v8, v8, v16
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v16, a0
-; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    vadd.vv v8, v16, v8
-; RV64I-NEXT:    vsrl.vi v16, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    vnot.v v16, v8
 ; RV64I-NEXT:    lui a0, 61681
 ; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    vmv.v.x v8, a0
+; RV64I-NEXT:    addi a1, sp, 16
+; RV64I-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT:    vsll.vi v24, v8, 2
+; RV64I-NEXT:    vxor.vx v24, v24, a0
+; RV64I-NEXT:    vadd.vv v0, v24, v24
+; RV64I-NEXT:    vxor.vv v0, v24, v0
+; RV64I-NEXT:    csrr a1, vlenb
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a1, sp, a1
+; RV64I-NEXT:    addi a1, a1, 16
+; RV64I-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT:    vsrl.vi v0, v16, 1
+; RV64I-NEXT:    csrr a1, vlenb
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a1, sp, a1
+; RV64I-NEXT:    addi a1, a1, 16
+; RV64I-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT:    vand.vv v0, v0, v8
+; RV64I-NEXT:    vsub.vv v16, v16, v0
+; RV64I-NEXT:    vand.vv v0, v16, v24
+; RV64I-NEXT:    vsrl.vi v16, v16, 2
+; RV64I-NEXT:    vand.vv v16, v16, v24
+; RV64I-NEXT:    vadd.vv v16, v0, v16
+; RV64I-NEXT:    vsrl.vi v24, v16, 4
+; RV64I-NEXT:    vadd.vv v16, v16, v24
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    addi a1, sp, 16
+; RV64I-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT:    vsrl.vi v8, v8, 3
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vv v8, v16, v8
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    csrr a0, vlenb
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    add sp, sp, a0
+; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
index c310274d685081..a33e7b54288d88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
@@ -675,37 +675,27 @@ declare <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32>)
 define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-LABEL: ctpop_nxv1i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2
+; RV32-NEXT:    vxor.vv v10, v9, v10
+; RV32-NEXT:    vadd.vv v11, v10, v10
+; RV32-NEXT:    vxor.vv v11, v10, v11
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v11, v12, v11
+; RV32-NEXT:    vsub.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v11, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vadd.vv v8, v11, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -713,34 +703,29 @@ define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
 ;
 ; RV64-LABEL: ctpop_nxv1i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
 ; RV64-NEXT:    lui a0, 61681
 ; RV64-NEXT:    addiw a0, a0, -241
 ; RV64-NEXT:    slli a1, a0, 32
 ; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a0
+; RV64-NEXT:    vsll.vi v10, v9, 2
+; RV64-NEXT:    vxor.vx v10, v10, a0
+; RV64-NEXT:    vadd.vv v11, v10, v10
+; RV64-NEXT:    vxor.vv v11, v10, v11
+; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    vand.vv v11, v12, v11
+; RV64-NEXT:    vsub.vv v8, v8, v11
+; RV64-NEXT:    vand.vv v11, v8, v10
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v10
+; RV64-NEXT:    vadd.vv v8, v11, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vsrl.vi v9, v9, 3
+; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -758,37 +743,27 @@ declare <vscale x 1 x i64> @llvm.ctpop.nxv1i64(<vscale x 1 x i64>)
 define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-LABEL: ctpop_nxv2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2
+; RV32-NEXT:    vxor.vv v12, v10, v12
+; RV32-NEXT:    vadd.vv v14, v12, v12
+; RV32-NEXT:    vxor.vv v14, v12, v14
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v14, v16, v14
+; RV32-NEXT:    vsub.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v14, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vadd.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -796,34 +771,29 @@ define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
 ;
 ; RV64-LABEL: ctpop_nxv2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
 ; RV64-NEXT:    lui a0, 61681
 ; RV64-NEXT:    addiw a0, a0, -241
 ; RV64-NEXT:    slli a1, a0, 32
 ; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a0
+; RV64-NEXT:    vsll.vi v12, v10, 2
+; RV64-NEXT:    vxor.vx v12, v12, a0
+; RV64-NEXT:    vadd.vv v14, v12, v12
+; RV64-NEXT:    vxor.vv v14, v12, v14
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vv v14, v16, v14
+; RV64-NEXT:    vsub.vv v8, v8, v14
+; RV64-NEXT:    vand.vv v14, v8, v12
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vadd.vv v8, v14, v8
+; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v12
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vsrl.vi v10, v10, 3
+; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -841,37 +811,27 @@ declare <vscale x 2 x i64> @llvm.ctpop.nxv2i64(<vscale x 2 x i64>)
 define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-LABEL: ctpop_nxv4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2
+; RV32-NEXT:    vxor.vv v16, v12, v16
+; RV32-NEXT:    vadd.vv v20, v16, v16
+; RV32-NEXT:    vxor.vv v20, v16, v20
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vand.vv v20, v24, v20
+; RV32-NEXT:    vsub.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v20, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v20, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -879,34 +839,29 @@ define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
 ;
 ; RV64-LABEL: ctpop_nxv4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
 ; RV64-NEXT:    lui a0, 61681
 ; RV64-NEXT:    addiw a0, a0, -241
 ; RV64-NEXT:    slli a1, a0, 32
 ; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a0
+; RV64-NEXT:    vsll.vi v16, v12, 2
+; RV64-NEXT:    vxor.vx v16, v16, a0
+; RV64-NEXT:    vadd.vv v20, v16, v16
+; RV64-NEXT:    vxor.vv v20, v16, v20
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vv v20, v24, v20
+; RV64-NEXT:    vsub.vv v8, v8, v20
+; RV64-NEXT:    vand.vv v20, v8, v16
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vadd.vv v8, v20, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vsrl.vi v12, v12, 3
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -924,74 +879,107 @@ declare <vscale x 4 x i64> @llvm.ctpop.nxv4i64(<vscale x 4 x i64>)
 define <vscale x 8 x i64> @ctpop_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-LABEL: ctpop_nxv8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ctpop_nxv8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-NEXT:    lui a0, 61681
 ; RV64-NEXT:    addiw a0, a0, -241
 ; RV64-NEXT:    slli a1, a0, 32
 ; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a0
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a0
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v0, v0, v16
+; RV64-NEXT:    vsub.vv v8, v8, v0
+; RV64-NEXT:    vand.vv v0, v8, v24
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vadd.vv v8, v0, v8
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctpop_nxv8i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 2310f85b1fba93..fc7f50e4048666 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -1221,37 +1221,27 @@ declare <vscale x 1 x i64> @llvm.vp.ctpop.nxv1i64(<vscale x 1 x i64>, <vscale x
 define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv1i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT:    vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vand.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -1259,34 +1249,30 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ;
 ; RV64-LABEL: vp_ctpop_nxv1i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT:    vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT:    vand.vv v11, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1303,37 +1289,27 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv1i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2
+; RV32-NEXT:    vxor.vv v10, v9, v10
+; RV32-NEXT:    vadd.vv v11, v10, v10
+; RV32-NEXT:    vxor.vv v11, v10, v11
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v11, v12, v11
+; RV32-NEXT:    vsub.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v11, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vadd.vv v8, v11, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1341,34 +1317,30 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vsll.vi v10, v9, 2
+; RV64-NEXT:    vxor.vx v10, v10, a1
+; RV64-NEXT:    vadd.vv v11, v10, v10
+; RV64-NEXT:    vxor.vv v11, v10, v11
+; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    vand.vv v11, v12, v11
+; RV64-NEXT:    vsub.vv v8, v8, v11
+; RV64-NEXT:    vand.vv v11, v8, v10
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v10
+; RV64-NEXT:    vadd.vv v8, v11, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v9, v9, 3
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1389,37 +1361,27 @@ declare <vscale x 2 x i64> @llvm.vp.ctpop.nxv2i64(<vscale x 2 x i64>, <vscale x
 define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT:    vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vand.vv v14, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -1427,34 +1389,30 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ;
 ; RV64-LABEL: vp_ctpop_nxv2i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT:    vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT:    vand.vv v14, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1471,37 +1429,27 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv2i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2
+; RV32-NEXT:    vxor.vv v12, v10, v12
+; RV32-NEXT:    vadd.vv v14, v12, v12
+; RV32-NEXT:    vxor.vv v14, v12, v14
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v14, v16, v14
+; RV32-NEXT:    vsub.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v14, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vadd.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1509,34 +1457,30 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vsll.vi v12, v10, 2
+; RV64-NEXT:    vxor.vx v12, v12, a1
+; RV64-NEXT:    vadd.vv v14, v12, v12
+; RV64-NEXT:    vxor.vv v14, v12, v14
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vv v14, v16, v14
+; RV64-NEXT:    vsub.vv v8, v8, v14
+; RV64-NEXT:    vand.vv v14, v8, v12
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vadd.vv v8, v14, v8
+; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v12
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v10, v10, 3
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1557,37 +1501,27 @@ declare <vscale x 4 x i64> @llvm.vp.ctpop.nxv4i64(<vscale x 4 x i64>, <vscale x
 define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT:    vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v20, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -1595,34 +1529,30 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ;
 ; RV64-LABEL: vp_ctpop_nxv4i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT:    vand.vv v20, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1639,37 +1569,27 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv4i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2
+; RV32-NEXT:    vxor.vv v16, v12, v16
+; RV32-NEXT:    vadd.vv v20, v16, v16
+; RV32-NEXT:    vxor.vv v20, v16, v20
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vand.vv v20, v24, v20
+; RV32-NEXT:    vsub.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v20, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v20, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1677,34 +1597,30 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vsll.vi v16, v12, 2
+; RV64-NEXT:    vxor.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v20, v16, v16
+; RV64-NEXT:    vxor.vv v20, v16, v20
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vv v20, v24, v20
+; RV64-NEXT:    vsub.vv v8, v8, v20
+; RV64-NEXT:    vand.vv v20, v8, v16
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vadd.vv v8, v20, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v12, v12, 3
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1725,74 +1641,163 @@ declare <vscale x 7 x i64> @llvm.vp.ctpop.nxv7i64(<vscale x 7 x i64>, <vscale x
 define <vscale x 7 x i64> @vp_ctpop_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv7i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv7i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v24, v16, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v24, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 1, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v24, v24, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv7i64:
@@ -1807,74 +1812,98 @@ define <vscale x 7 x i64> @vp_ctpop_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 define <vscale x 7 x i64> @vp_ctpop_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv7i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv7i64_unmasked:
@@ -1893,74 +1922,163 @@ declare <vscale x 8 x i64> @llvm.vp.ctpop.nxv8i64(<vscale x 8 x i64>, <vscale x
 define <vscale x 8 x i64> @vp_ctpop_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv8i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v24, v16, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v24, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 1, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v24, v24, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i64:
@@ -1975,74 +2093,98 @@ define <vscale x 8 x i64> @vp_ctpop_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 define <vscale x 8 x i64> @vp_ctpop_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_nxv8i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i64_unmasked:
@@ -2064,20 +2206,19 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    vmv1r.v v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -2089,101 +2230,121 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    sltu a3, a0, a2
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    addi a3, a3, -241
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -2194,98 +2355,112 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2296,82 +2471,242 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a2, 48
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; RV64-NEXT:    vmv1r.v v24, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a2, a1, 3
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    srli a1, a2, 3
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a2
-; RV64-NEXT:    mv a2, a0
-; RV64-NEXT:    bltu a0, a1, .LBB46_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:  .LBB46_2:
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
+; RV64-NEXT:    sub a1, a0, a2
+; RV64-NEXT:    sltu a3, a0, a1
+; RV64-NEXT:    addi a3, a3, -1
+; RV64-NEXT:    and a3, a3, a1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    li a5, 40
+; RV64-NEXT:    mul a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    li a5, 24
+; RV64-NEXT:    mul a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    bltu a0, a2, .LBB46_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:  .LBB46_2:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a3, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -2407,162 +2742,215 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    sub a2, a0, a1
 ; RV32-NEXT:    sltu a3, a0, a2
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vsub.vv v24, v16, v24
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v24, v0
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vadd.vv v24, v16, v24
-; RV32-NEXT:    vsrl.vi v16, v24, 4
-; RV32-NEXT:    vadd.vv v16, v24, v16
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    addi a3, a3, -241
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v8, v24, v0
+; RV32-NEXT:    vadd.vv v0, v8, v8
+; RV32-NEXT:    vxor.vv v16, v8, v0
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v0, v0, 1
+; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v0, v16
+; RV32-NEXT:    vand.vv v0, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vmul.vv v16, v16, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v16, v16, a2
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB47_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB47_2:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vsll.vi v8, v24, 2
+; RV32-NEXT:    vxor.vv v8, v24, v8
+; RV32-NEXT:    vadd.vv v0, v8, v8
+; RV32-NEXT:    vxor.vv v16, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vsub.vv v24, v8, v24
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v0, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v0
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsub.vv v16, v0, v16
+; RV32-NEXT:    vand.vv v0, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vx v8, v8, a2
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    mv a2, a0
-; RV64-NEXT:    bltu a0, a1, .LBB47_2
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    sub a1, a0, a2
+; RV64-NEXT:    sltu a3, a0, a1
+; RV64-NEXT:    addi a3, a3, -1
+; RV64-NEXT:    and a3, a3, a1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v0, v8, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v16, v0, v24
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vv v16, v24, v16
+; RV64-NEXT:    vsub.vv v16, v8, v16
+; RV64-NEXT:    vand.vv v24, v16, v0
+; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vv v16, v16, v0
+; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv8r.v v8, v24
+; RV64-NEXT:    vsrl.vi v24, v24, 3
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vmul.vv v16, v16, v24
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vsrl.vx v16, v16, a3
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    bltu a0, a2, .LBB47_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    mv a0, a2
 ; RV64-NEXT:  .LBB47_2:
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsll.vi v24, v8, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v16, v24, v0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v0, v8, 1
+; RV64-NEXT:    vand.vv v16, v0, v16
+; RV64-NEXT:    vsub.vv v16, v8, v16
+; RV64-NEXT:    vand.vv v0, v16, v24
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vand.vv v16, v16, v24
+; RV64-NEXT:    vadd.vv v16, v0, v16
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v8, 3
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vmul.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vx v8, v8, a3
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked:
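
The updated check lines above no longer materialize the 0x5555..., 0x3333... and
0x0101... popcount constants separately; they rebuild them from the single
0x0f0f0f0f0f0f0f0f splat: 0x3333... is (C << 2) ^ C, 0x5555... is (M2 << 1) ^ M2
(the vadd.vv of a value with itself is the shift-by-one form used in the unmasked
paths), and 0x0101... is (C >> 3) & C. A minimal scalar C++ sketch of those
identities, written for this mail only and not taken from the patch (all names
are illustrative; only the constants come from the checks above):

// Sanity check of the mask-derivation identities used by the new lowering.
#include <cassert>
#include <cstdint>

int main() {
  constexpr uint64_t C4  = 0x0f0f0f0f0f0f0f0full; // the one materialized constant
  constexpr uint64_t M2  = (C4 << 2) ^ C4;        // vsll.vi 2 + vxor        -> 0x3333...
  constexpr uint64_t M1  = (M2 << 1) ^ M2;        // vsll.vi 1/vadd.vv + vxor -> 0x5555...
  constexpr uint64_t H01 = (C4 >> 3) & C4;        // vsrl.vi 3 + vand        -> 0x0101...
  static_assert(M2  == 0x3333333333333333ull, "derived pair mask");
  static_assert(M1  == 0x5555555555555555ull, "derived bit mask");
  static_assert(H01 == 0x0101010101010101ull, "derived byte-sum multiplier");

  // Scalar model of the lowered ctpop using only the derived masks.
  auto ctpop64 = [&](uint64_t x) -> uint64_t {
    x -= (x >> 1) & M1;
    x = (x & M2) + ((x >> 2) & M2);
    x = (x + (x >> 4)) & C4;
    return (x * H01) >> 56;                       // vmul.vv + vsrl.vx 56
  };
  for (uint64_t v : {0ull, 1ull, 0xffull, 0xdeadbeefcafebabeull, ~0ull})
    assert(ctpop64(v) == static_cast<uint64_t>(__builtin_popcountll(v))); // GCC/Clang reference
  return 0;
}
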
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index b14cde25aa85b2..6ee63fc9a12e8a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -1135,41 +1135,31 @@ declare <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32>, i1)
 define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I-LABEL: cttz_nxv1i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a0, 1
-; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vsub.vx v9, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsll.vi v10, v9, 2
+; RV32I-NEXT:    vxor.vv v10, v9, v10
+; RV32I-NEXT:    vadd.vv v11, v10, v10
+; RV32I-NEXT:    vxor.vv v11, v10, v11
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsub.vx v12, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v11, v12, v11
+; RV32I-NEXT:    vsub.vv v8, v8, v11
+; RV32I-NEXT:    vand.vv v11, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vadd.vv v8, v11, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsrl.vi v10, v9, 3
+; RV32I-NEXT:    vand.vv v9, v9, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1177,38 +1167,33 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ;
 ; RV64I-LABEL: cttz_nxv1i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64I-NEXT:    vsub.vx v9, v8, a0
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v9
-; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v9, a0
-; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    vadd.vv v8, v9, v8
-; RV64I-NEXT:    vsrl.vi v9, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v9
 ; RV64I-NEXT:    lui a0, 61681
 ; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vmv.v.x v9, a0
+; RV64I-NEXT:    vsll.vi v10, v9, 2
+; RV64I-NEXT:    vxor.vx v10, v10, a0
+; RV64I-NEXT:    vadd.vv v11, v10, v10
+; RV64I-NEXT:    vxor.vv v11, v10, v11
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    vsub.vx v12, v8, a1
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vand.vv v11, v12, v11
+; RV64I-NEXT:    vsub.vv v8, v8, v11
+; RV64I-NEXT:    vand.vv v11, v8, v10
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v10
+; RV64I-NEXT:    vadd.vv v8, v11, v8
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v10
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v9, v9, 3
+; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v9
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1298,41 +1283,31 @@ declare <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64>, i1)
 define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I-LABEL: cttz_nxv2i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a0, 1
-; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vsub.vx v10, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsll.vi v12, v10, 2
+; RV32I-NEXT:    vxor.vv v12, v10, v12
+; RV32I-NEXT:    vadd.vv v14, v12, v12
+; RV32I-NEXT:    vxor.vv v14, v12, v14
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsub.vx v16, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vand.vv v14, v16, v14
+; RV32I-NEXT:    vsub.vv v8, v8, v14
+; RV32I-NEXT:    vand.vv v14, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vadd.vv v8, v14, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsrl.vi v12, v10, 3
+; RV32I-NEXT:    vand.vv v10, v10, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1340,38 +1315,33 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ;
 ; RV64I-LABEL: cttz_nxv2i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64I-NEXT:    vsub.vx v10, v8, a0
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v10
-; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v10, a0
-; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    vadd.vv v8, v10, v8
-; RV64I-NEXT:    vsrl.vi v10, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v10
 ; RV64I-NEXT:    lui a0, 61681
 ; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vmv.v.x v10, a0
+; RV64I-NEXT:    vsll.vi v12, v10, 2
+; RV64I-NEXT:    vxor.vx v12, v12, a0
+; RV64I-NEXT:    vadd.vv v14, v12, v12
+; RV64I-NEXT:    vxor.vv v14, v12, v14
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    vsub.vx v16, v8, a1
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vand.vv v14, v16, v14
+; RV64I-NEXT:    vsub.vv v8, v8, v14
+; RV64I-NEXT:    vand.vv v14, v8, v12
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    vadd.vv v8, v14, v8
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v12
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v10, v10, 3
+; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v10
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1461,41 +1431,31 @@ declare <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64>, i1)
 define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I-LABEL: cttz_nxv4i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a0, 1
-; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vsub.vx v12, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vsrl.vi v12, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsll.vi v16, v12, 2
+; RV32I-NEXT:    vxor.vv v16, v12, v16
+; RV32I-NEXT:    vadd.vv v20, v16, v16
+; RV32I-NEXT:    vxor.vv v20, v16, v20
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsub.vx v24, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v24
+; RV32I-NEXT:    vsrl.vi v24, v8, 1
+; RV32I-NEXT:    vand.vv v20, v24, v20
+; RV32I-NEXT:    vsub.vv v8, v8, v20
+; RV32I-NEXT:    vand.vv v20, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vadd.vv v8, v20, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v16
 ; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsrl.vi v16, v12, 3
+; RV32I-NEXT:    vand.vv v12, v12, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1503,38 +1463,33 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ;
 ; RV64I-LABEL: cttz_nxv4i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64I-NEXT:    vsub.vx v12, v8, a0
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v12
-; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v12, a0
-; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    vadd.vv v8, v12, v8
-; RV64I-NEXT:    vsrl.vi v12, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v12
 ; RV64I-NEXT:    lui a0, 61681
 ; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT:    vmv.v.x v12, a0
+; RV64I-NEXT:    vsll.vi v16, v12, 2
+; RV64I-NEXT:    vxor.vx v16, v16, a0
+; RV64I-NEXT:    vadd.vv v20, v16, v16
+; RV64I-NEXT:    vxor.vv v20, v16, v20
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    vsub.vx v24, v8, a1
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v24
+; RV64I-NEXT:    vsrl.vi v24, v8, 1
+; RV64I-NEXT:    vand.vv v20, v24, v20
+; RV64I-NEXT:    vsub.vv v8, v8, v20
+; RV64I-NEXT:    vand.vv v20, v8, v16
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vadd.vv v8, v20, v8
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v12, v12, 3
+; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v12
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1624,82 +1579,105 @@ declare <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64>, i1)
 define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-LABEL: cttz_nxv8i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a0, 1
-; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vsub.vx v16, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vsrl.vi v16, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v24, v8, v16
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    sub sp, sp, a0
+; RV32I-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsll.vi v24, v16, 2
+; RV32I-NEXT:    vxor.vv v24, v16, v24
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsub.vx v0, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v0
+; RV32I-NEXT:    addi a0, sp, 16
+; RV32I-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT:    vadd.vv v0, v24, v24
+; RV32I-NEXT:    vxor.vv v0, v24, v0
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 3
+; RV32I-NEXT:    add a0, sp, a0
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT:    vsrl.vi v0, v8, 1
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 3
+; RV32I-NEXT:    add a0, sp, a0
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT:    vand.vv v0, v0, v8
+; RV32I-NEXT:    addi a0, sp, 16
+; RV32I-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT:    vsub.vv v8, v8, v0
+; RV32I-NEXT:    vand.vv v0, v8, v24
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v24
+; RV32I-NEXT:    vadd.vv v8, v0, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsrl.vi v24, v16, 3
+; RV32I-NEXT:    vand.vv v16, v16, v24
 ; RV32I-NEXT:    vmul.vv v8, v8, v16
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    add sp, sp, a0
+; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: cttz_nxv8i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    .cfi_def_cfa_offset 16
+; RV64I-NEXT:    csrr a0, vlenb
+; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    sub sp, sp, a0
+; RV64I-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    slli a1, a0, 32
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    vmv.v.x v16, a0
+; RV64I-NEXT:    addi a1, sp, 16
+; RV64I-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT:    vsll.vi v24, v16, 2
+; RV64I-NEXT:    vxor.vx v24, v24, a0
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    vsub.vx v0, v8, a1
 ; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vand.vv v8, v8, v0
+; RV64I-NEXT:    vadd.vv v0, v24, v24
+; RV64I-NEXT:    vxor.vv v0, v24, v0
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vand.vv v16, v16, v0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vv v16, v8, v24
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vv v8, v8, v24
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    addi a1, sp, 16
+; RV64I-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT:    vsrl.vi v16, v16, 3
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v16
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    csrr a0, vlenb
+; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    add sp, sp, a0
+; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; RV32F-LABEL: cttz_nxv8i64:
@@ -2813,41 +2791,31 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I-LABEL: cttz_zero_undef_nxv1i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a0, 1
-; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vsub.vx v9, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsll.vi v10, v9, 2
+; RV32I-NEXT:    vxor.vv v10, v9, v10
+; RV32I-NEXT:    vadd.vv v11, v10, v10
+; RV32I-NEXT:    vxor.vv v11, v10, v11
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsub.vx v12, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v11, v12, v11
+; RV32I-NEXT:    vsub.vv v8, v8, v11
+; RV32I-NEXT:    vand.vv v11, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vadd.vv v8, v11, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsrl.vi v10, v9, 3
+; RV32I-NEXT:    vand.vv v9, v9, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2855,38 +2823,33 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ;
 ; RV64I-LABEL: cttz_zero_undef_nxv1i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64I-NEXT:    vsub.vx v9, v8, a0
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v9
-; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v9, a0
-; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    vadd.vv v8, v9, v8
-; RV64I-NEXT:    vsrl.vi v9, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v9
 ; RV64I-NEXT:    lui a0, 61681
 ; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vmv.v.x v9, a0
+; RV64I-NEXT:    vsll.vi v10, v9, 2
+; RV64I-NEXT:    vxor.vx v10, v10, a0
+; RV64I-NEXT:    vadd.vv v11, v10, v10
+; RV64I-NEXT:    vxor.vv v11, v10, v11
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    vsub.vx v12, v8, a1
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vand.vv v11, v12, v11
+; RV64I-NEXT:    vsub.vv v8, v8, v11
+; RV64I-NEXT:    vand.vv v11, v8, v10
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v10
+; RV64I-NEXT:    vadd.vv v8, v11, v8
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v10
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v9, v9, 3
+; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v9
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2933,41 +2896,31 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I-LABEL: cttz_zero_undef_nxv2i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a0, 1
-; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vsub.vx v10, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsll.vi v12, v10, 2
+; RV32I-NEXT:    vxor.vv v12, v10, v12
+; RV32I-NEXT:    vadd.vv v14, v12, v12
+; RV32I-NEXT:    vxor.vv v14, v12, v14
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsub.vx v16, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vand.vv v14, v16, v14
+; RV32I-NEXT:    vsub.vv v8, v8, v14
+; RV32I-NEXT:    vand.vv v14, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vadd.vv v8, v14, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsrl.vi v12, v10, 3
+; RV32I-NEXT:    vand.vv v10, v10, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2975,38 +2928,33 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ;
 ; RV64I-LABEL: cttz_zero_undef_nxv2i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64I-NEXT:    vsub.vx v10, v8, a0
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v10
-; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v10, a0
-; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    vadd.vv v8, v10, v8
-; RV64I-NEXT:    vsrl.vi v10, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v10
 ; RV64I-NEXT:    lui a0, 61681
 ; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vmv.v.x v10, a0
+; RV64I-NEXT:    vsll.vi v12, v10, 2
+; RV64I-NEXT:    vxor.vx v12, v12, a0
+; RV64I-NEXT:    vadd.vv v14, v12, v12
+; RV64I-NEXT:    vxor.vv v14, v12, v14
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    vsub.vx v16, v8, a1
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vand.vv v14, v16, v14
+; RV64I-NEXT:    vsub.vv v8, v8, v14
+; RV64I-NEXT:    vand.vv v14, v8, v12
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    vadd.vv v8, v14, v8
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v12
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v10, v10, 3
+; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v10
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3053,41 +3001,31 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I-LABEL: cttz_zero_undef_nxv4i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a0, 1
-; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vsub.vx v12, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vsrl.vi v12, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsll.vi v16, v12, 2
+; RV32I-NEXT:    vxor.vv v16, v12, v16
+; RV32I-NEXT:    vadd.vv v20, v16, v16
+; RV32I-NEXT:    vxor.vv v20, v16, v20
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsub.vx v24, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v24
+; RV32I-NEXT:    vsrl.vi v24, v8, 1
+; RV32I-NEXT:    vand.vv v20, v24, v20
+; RV32I-NEXT:    vsub.vv v8, v8, v20
+; RV32I-NEXT:    vand.vv v20, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vadd.vv v8, v20, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v16
 ; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsrl.vi v16, v12, 3
+; RV32I-NEXT:    vand.vv v12, v12, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -3095,38 +3033,33 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ;
 ; RV64I-LABEL: cttz_zero_undef_nxv4i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64I-NEXT:    vsub.vx v12, v8, a0
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v12
-; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v12, a0
-; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    vadd.vv v8, v12, v8
-; RV64I-NEXT:    vsrl.vi v12, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v12
 ; RV64I-NEXT:    lui a0, 61681
 ; RV64I-NEXT:    addiw a0, a0, -241
 ; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT:    vmv.v.x v12, a0
+; RV64I-NEXT:    vsll.vi v16, v12, 2
+; RV64I-NEXT:    vxor.vx v16, v16, a0
+; RV64I-NEXT:    vadd.vv v20, v16, v16
+; RV64I-NEXT:    vxor.vv v20, v16, v20
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    vsub.vx v24, v8, a1
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v24
+; RV64I-NEXT:    vsrl.vi v24, v8, 1
+; RV64I-NEXT:    vand.vv v20, v24, v20
+; RV64I-NEXT:    vsub.vv v8, v8, v20
+; RV64I-NEXT:    vand.vv v20, v8, v16
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vadd.vv v8, v20, v8
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v12, v12, 3
+; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v12
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3173,82 +3106,105 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-LABEL: cttz_zero_undef_nxv8i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a0, 1
-; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vsub.vx v16, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vsrl.vi v16, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v24, v8, v16
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    sub sp, sp, a0
+; RV32I-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsll.vi v24, v16, 2
+; RV32I-NEXT:    vxor.vv v24, v16, v24
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsub.vx v0, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v0
+; RV32I-NEXT:    addi a0, sp, 16
+; RV32I-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT:    vadd.vv v0, v24, v24
+; RV32I-NEXT:    vxor.vv v0, v24, v0
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 3
+; RV32I-NEXT:    add a0, sp, a0
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT:    vsrl.vi v0, v8, 1
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 3
+; RV32I-NEXT:    add a0, sp, a0
+; RV32I-NEXT:    addi a0, a0, 16
+; RV32I-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT:    vand.vv v0, v0, v8
+; RV32I-NEXT:    addi a0, sp, 16
+; RV32I-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT:    vsub.vv v8, v8, v0
+; RV32I-NEXT:    vand.vv v0, v8, v24
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v24
+; RV32I-NEXT:    vadd.vv v8, v0, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 4112
-; RV32I-NEXT:    addi a0, a0, 257
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsrl.vi v24, v16, 3
+; RV32I-NEXT:    vand.vv v16, v16, v24
 ; RV32I-NEXT:    vmul.vv v8, v8, v16
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    csrr a0, vlenb
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    add sp, sp, a0
+; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: cttz_zero_undef_nxv8i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    .cfi_def_cfa_offset 16
+; RV64I-NEXT:    csrr a0, vlenb
+; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    sub sp, sp, a0
+; RV64I-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    slli a1, a0, 32
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    vmv.v.x v16, a0
+; RV64I-NEXT:    addi a1, sp, 16
+; RV64I-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64I-NEXT:    vsll.vi v24, v16, 2
+; RV64I-NEXT:    vxor.vx v24, v24, a0
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    vsub.vx v0, v8, a1
 ; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vand.vv v8, v8, v0
+; RV64I-NEXT:    vadd.vv v0, v24, v24
+; RV64I-NEXT:    vxor.vv v0, v24, v0
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vand.vv v16, v16, v0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vv v16, v8, v24
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vv v8, v8, v24
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    addi a1, sp, 16
+; RV64I-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64I-NEXT:    vsrl.vi v16, v16, 3
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vmul.vv v8, v8, v16
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    csrr a0, vlenb
+; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    add sp, sp, a0
+; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; CHECK-F-LABEL: cttz_zero_undef_nxv8i64:
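
A note on the pattern in the updated check lines above: the new sequences materialize only the 0x0f0f0f0f0f0f0f0f byte mask (lui 61681 / addi -241, widened with slli/add on RV64) and rebuild the remaining popcount constants from it inside vector registers instead of splatting each one from scalar code. Below is a minimal C++ sketch of the identities that the new vsll.vi/vxor, vadd.vv (or vsll.vi 1)/vxor and vsrl.vi/vand sequences rely on; the constant names are illustrative and not taken from the patch:

  #include <cstdint>

  // Seed constant: the only value the new lowering builds in scalar code.
  constexpr uint64_t C0F = 0x0F0F0F0F0F0F0F0Full;

  // vsll.vi 2 + vxor recreate the 2-bit mask 0x33...
  constexpr uint64_t C33 = C0F ^ (C0F << 2);
  static_assert(C33 == 0x3333333333333333ull, "");

  // vadd.vv v,v (a shift left by 1) + vxor recreate the 1-bit mask 0x55...
  static_assert((C33 ^ (C33 << 1)) == 0x5555555555555555ull, "");

  // vsrl.vi 3 + vand recreate the 0x01... multiplier consumed by vmul
  // before the final vsrl.vx 56.
  static_assert((C0F & (C0F >> 3)) == 0x0101010101010101ull, "");

The visible cost is register pressure at LMUL=8: keeping the derived masks live forces the new nxv8i64 (and, in the cttz-vp.ll hunks below, nxv7i64) output to spill vector registers to the stack, which is where the added csrr vlenb / vs8r.v / vl8r.v sequences come from.
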
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index 145ce6e917f962..071b76899e7523 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -1365,41 +1365,31 @@ declare <vscale x 1 x i64> @llvm.vp.cttz.nxv1i64(<vscale x 1 x i64>, i1 immarg,
 define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv1i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT:    vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v12, v8, a0, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vand.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -1407,38 +1397,34 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
 ;
 ; RV64-LABEL: vp_cttz_nxv1i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT:    vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v12, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT:    vand.vv v11, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1455,41 +1441,31 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
 define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv1i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2
+; RV32-NEXT:    vxor.vv v10, v9, v10
+; RV32-NEXT:    vadd.vv v11, v10, v10
+; RV32-NEXT:    vxor.vv v11, v10, v11
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v12, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v11, v12, v11
+; RV32-NEXT:    vsub.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v11, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vadd.vv v8, v11, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1497,38 +1473,34 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_cttz_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsub.vx v9, v8, a1
+; RV64-NEXT:    vsll.vi v10, v9, 2
+; RV64-NEXT:    vxor.vx v10, v10, a1
+; RV64-NEXT:    vadd.vv v11, v10, v10
+; RV64-NEXT:    vxor.vv v11, v10, v11
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v12, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    vand.vv v11, v12, v11
+; RV64-NEXT:    vsub.vv v8, v8, v11
+; RV64-NEXT:    vand.vv v11, v8, v10
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v10
+; RV64-NEXT:    vadd.vv v8, v11, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v9, v9, 3
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1549,41 +1521,31 @@ declare <vscale x 2 x i64> @llvm.vp.cttz.nxv2i64(<vscale x 2 x i64>, i1 immarg,
 define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT:    vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v16, v8, a0, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vand.vv v14, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -1591,38 +1553,34 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
 ;
 ; RV64-LABEL: vp_cttz_nxv2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT:    vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT:    vand.vv v14, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1639,41 +1597,31 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
 define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv2i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2
+; RV32-NEXT:    vxor.vv v12, v10, v12
+; RV32-NEXT:    vadd.vv v14, v12, v12
+; RV32-NEXT:    vxor.vv v14, v12, v14
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v16, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v14, v16, v14
+; RV32-NEXT:    vsub.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v14, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vadd.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1681,38 +1629,34 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_cttz_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsub.vx v10, v8, a1
+; RV64-NEXT:    vsll.vi v12, v10, 2
+; RV64-NEXT:    vxor.vx v12, v12, a1
+; RV64-NEXT:    vadd.vv v14, v12, v12
+; RV64-NEXT:    vxor.vv v14, v12, v14
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v16, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vv v14, v16, v14
+; RV64-NEXT:    vsub.vv v8, v8, v14
+; RV64-NEXT:    vand.vv v14, v8, v12
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vadd.vv v8, v14, v8
+; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v12
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v10, v10, 3
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1733,41 +1677,31 @@ declare <vscale x 4 x i64> @llvm.vp.cttz.nxv4i64(<vscale x 4 x i64>, i1 immarg,
 define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT:    vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v24, v8, a0, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v20, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -1775,38 +1709,34 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
 ;
 ; RV64-LABEL: vp_cttz_nxv4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v24, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT:    vand.vv v20, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1823,41 +1753,31 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
 define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv4i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2
+; RV32-NEXT:    vxor.vv v16, v12, v16
+; RV32-NEXT:    vadd.vv v20, v16, v16
+; RV32-NEXT:    vxor.vv v20, v16, v20
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v24, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vand.vv v20, v24, v20
+; RV32-NEXT:    vsub.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v20, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v20, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1865,38 +1785,34 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_cttz_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsub.vx v12, v8, a1
+; RV64-NEXT:    vsll.vi v16, v12, 2
+; RV64-NEXT:    vxor.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v20, v16, v16
+; RV64-NEXT:    vxor.vv v20, v16, v20
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v24, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vv v20, v24, v20
+; RV64-NEXT:    vsub.vv v8, v8, v20
+; RV64-NEXT:    vand.vv v20, v8, v16
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vadd.vv v8, v20, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v12, v12, 3
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1917,82 +1833,182 @@ declare <vscale x 7 x i64> @llvm.vp.cttz.nxv7i64(<vscale x 7 x i64>, i1 immarg,
 define <vscale x 7 x i64> @vp_cttz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv7i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv7i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv7i64:
@@ -2007,82 +2023,106 @@ define <vscale x 7 x i64> @vp_cttz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x
 define <vscale x 7 x i64> @vp_cttz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv7i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v0, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v0, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv7i64_unmasked:
@@ -2101,82 +2141,182 @@ declare <vscale x 8 x i64> @llvm.vp.cttz.nxv8i64(<vscale x 8 x i64>, i1 immarg,
 define <vscale x 8 x i64> @vp_cttz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i64:
@@ -2191,82 +2331,106 @@ define <vscale x 8 x i64> @vp_cttz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x
 define <vscale x 8 x i64> @vp_cttz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_nxv8i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v0, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v0, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i64_unmasked:
@@ -2288,13 +2452,19 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    vmv1r.v v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -2305,120 +2475,114 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    sub a2, a0, a1
 ; RV32-NEXT:    sltu a3, a0, a2
 ; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a3, a3, a2
+; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    addi a3, a3, -241
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    li a2, 1
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 209715
-; RV32-NEXT:    addi a4, a4, 819
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    addi a4, a4, -241
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a3, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB46_2
 ; RV32-NEXT:  # %bb.1:
@@ -2427,107 +2591,113 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vnot.v v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a3, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2538,89 +2708,235 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a2, 48
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV64-NEXT:    vmv1r.v v24, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a2, a1, 3
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    srli a1, a2, 3
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a2
-; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    sltu a3, a0, a2
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
+; RV64-NEXT:    sub a1, a0, a2
+; RV64-NEXT:    sltu a3, a0, a1
 ; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a3, a3, a2
-; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    and a3, a3, a1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    li a5, 40
+; RV64-NEXT:    mul a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v8, v16, a2, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV64-NEXT:    lui a3, 349525
-; RV64-NEXT:    addiw a3, a3, 1365
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v8, v0.t
-; RV64-NEXT:    lui a4, 209715
-; RV64-NEXT:    addiw a4, a4, 819
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v16, a4, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    li a3, 1
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v8, v16, a3, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vnot.v v8, v16, v0.t
+; RV64-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a5, 61681
-; RV64-NEXT:    addiw a5, a5, -241
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addiw a6, a6, 257
-; RV64-NEXT:    slli a7, a6, 32
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    vmul.vx v8, v8, a6, v0.t
-; RV64-NEXT:    li a7, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a7, v0.t
-; RV64-NEXT:    addi t0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
-; RV64-NEXT:    bltu a0, a1, .LBB46_2
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    li a5, 40
+; RV64-NEXT:    mul a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    li a4, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a4, v0.t
+; RV64-NEXT:    csrr a5, vlenb
+; RV64-NEXT:    slli a5, a5, 4
+; RV64-NEXT:    add a5, sp, a5
+; RV64-NEXT:    addi a5, a5, 16
+; RV64-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; RV64-NEXT:    bltu a0, a2, .LBB46_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    mv a0, a2
 ; RV64-NEXT:  .LBB46_2:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsub.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v8, v16, a3, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vnot.v v8, v16, v0.t
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a6, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a7, v0.t
-; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -2656,12 +2972,12 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -2669,62 +2985,53 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    sub a2, a0, a1
 ; RV32-NEXT:    sltu a3, a0, a2
 ; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a3, a3, a2
+; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    addi a3, a3, -241
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    li a2, 1
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v16, a2
+; RV32-NEXT:    vsub.vx v0, v16, a2
 ; RV32-NEXT:    vnot.v v16, v16
-; RV32-NEXT:    vand.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a4
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    lui a4, 209715
-; RV32-NEXT:    addi a4, a4, 819
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a4
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    addi a4, a4, -241
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a4
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v16, v0
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v16, v0, v0
+; RV32-NEXT:    vxor.vv v16, v0, v16
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vmul.vv v16, v16, v8
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a3, 56
-; RV32-NEXT:    vsrl.vx v8, v16, a3
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vsrl.vx v8, v8, a3
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB47_2
 ; RV32-NEXT:  # %bb.1:
@@ -2732,45 +3039,52 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:  .LBB47_2:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vx v16, v24, a2
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vand.vv v16, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v16, 1
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v8, v16, a2
+; RV32-NEXT:    vnot.v v0, v16
+; RV32-NEXT:    vand.vv v16, v0, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v8, v0, v0
+; RV32-NEXT:    vxor.vv v8, v0, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v8
-; RV32-NEXT:    vsub.vv v16, v16, v24
-; RV32-NEXT:    vand.vv v24, v16, v0
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v16, 1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vx v8, v8, a3
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vx v8, v8, a3
-; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2778,65 +3092,109 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ;
 ; RV64-LABEL: vp_cttz_nxv16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    sltu a3, a0, a2
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    sub a1, a0, a2
+; RV64-NEXT:    sltu a3, a0, a1
 ; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a3, a3, a2
-; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    and a3, a3, a1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v16, a2
+; RV64-NEXT:    li a3, 1
+; RV64-NEXT:    vsub.vx v0, v16, a3
 ; RV64-NEXT:    vnot.v v16, v16
-; RV64-NEXT:    vand.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    lui a3, 349525
-; RV64-NEXT:    addiw a3, a3, 1365
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v24, a3
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    lui a4, 209715
-; RV64-NEXT:    addiw a4, a4, 819
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v24, v16, a4
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    lui a5, 61681
-; RV64-NEXT:    addiw a5, a5, -241
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addiw a6, a6, 257
-; RV64-NEXT:    slli a7, a6, 32
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    vmul.vx v16, v16, a6
-; RV64-NEXT:    li a7, 56
-; RV64-NEXT:    vsrl.vx v16, v16, a7
-; RV64-NEXT:    bltu a0, a1, .LBB47_2
+; RV64-NEXT:    vand.vv v16, v16, v0
+; RV64-NEXT:    vsll.vi v0, v8, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v24, v0, v24
+; RV64-NEXT:    vsrl.vi v8, v16, 1
+; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    vand.vv v16, v8, v0
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv8r.v v0, v16
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    li a4, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a4
+; RV64-NEXT:    addi a5, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; RV64-NEXT:    bltu a0, a2, .LBB47_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    mv a0, a2
 ; RV64-NEXT:  .LBB47_2:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v8, a2
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a3
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v8, v16, a3
+; RV64-NEXT:    vnot.v v24, v16
+; RV64-NEXT:    vand.vv v8, v24, v8
+; RV64-NEXT:    vsll.vi v24, v0, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v16, v24, v0
+; RV64-NEXT:    vsrl.vi v0, v8, 1
+; RV64-NEXT:    vand.vv v16, v0, v16
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vand.vx v8, v8, a5
-; RV64-NEXT:    vmul.vx v8, v8, a6
-; RV64-NEXT:    vsrl.vx v8, v8, a7
+; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vx v8, v8, a4
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i64_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 36f22bd3259cf9..8ab3ee6206de73 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -913,35 +913,26 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsll.vi v11, v10, 2, v0.t
+; RV32-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT:    vsll.vi v12, v11, 1, v0.t
+; RV32-NEXT:    vxor.vv v12, v11, v12, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v12, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v11, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v9, v10, 3, v0.t
+; RV32-NEXT:    vand.vv v9, v10, v9, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -960,37 +951,34 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT:    vsll.vi v11, v10, 2, v0.t
+; RV64-NEXT:    vxor.vx v11, v11, a1, v0.t
+; RV64-NEXT:    vsll.vi v12, v11, 1, v0.t
+; RV64-NEXT:    vxor.vv v12, v11, v12, v0.t
+; RV64-NEXT:    vand.vv v9, v9, v12, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v9, v8, v11, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v9, v10, 3, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1017,35 +1005,26 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsll.vi v11, v10, 2
+; RV32-NEXT:    vxor.vv v11, v10, v11
+; RV32-NEXT:    vadd.vv v12, v11, v11
+; RV32-NEXT:    vxor.vv v12, v11, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v11
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vadd.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsrl.vi v9, v10, 3
+; RV32-NEXT:    vand.vv v9, v10, v9
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1064,37 +1043,34 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v9, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT:    vsll.vi v11, v10, 2
+; RV64-NEXT:    vxor.vx v11, v11, a1
+; RV64-NEXT:    vadd.vv v12, v11, v11
+; RV64-NEXT:    vxor.vv v12, v11, v12
+; RV64-NEXT:    vand.vv v9, v9, v12
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vv v9, v8, v11
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v11
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v9, v10, 3
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1125,35 +1101,26 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsll.vi v14, v12, 2, v0.t
+; RV32-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT:    vsll.vi v16, v14, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v14, v16, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v16, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v14, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v10, v12, 3, v0.t
+; RV32-NEXT:    vand.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -1172,37 +1139,34 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT:    vsll.vi v14, v12, 2, v0.t
+; RV64-NEXT:    vxor.vx v14, v14, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v14, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v14, v16, v0.t
+; RV64-NEXT:    vand.vv v10, v10, v16, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v10, v8, v14, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v10, v12, 3, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1229,35 +1193,26 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsll.vi v14, v12, 2
+; RV32-NEXT:    vxor.vv v14, v12, v14
+; RV32-NEXT:    vadd.vv v16, v14, v14
+; RV32-NEXT:    vxor.vv v16, v14, v16
+; RV32-NEXT:    vand.vv v10, v10, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v14
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v10, v12, 3
+; RV32-NEXT:    vand.vv v10, v12, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1276,37 +1231,34 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v10, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT:    vsll.vi v14, v12, 2
+; RV64-NEXT:    vxor.vx v14, v14, a1
+; RV64-NEXT:    vadd.vv v16, v14, v14
+; RV64-NEXT:    vxor.vv v16, v14, v16
+; RV64-NEXT:    vand.vv v10, v10, v16
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vv v10, v8, v14
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v14
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v10, v12, 3
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1335,38 +1287,29 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v12, 1, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsll.vi v20, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v20, v8, v20, v0.t
+; RV32-NEXT:    vsll.vi v24, v20, 1, v0.t
+; RV32-NEXT:    vxor.vv v24, v20, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v20, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v20, v0.t
+; RV32-NEXT:    vadd.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vsrl.vi v16, v12, 4, v0.t
+; RV32-NEXT:    vadd.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    ret
@@ -1384,37 +1327,34 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vnot.v v12, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v12, 1, v0.t
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT:    vsll.vi v20, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v20, v20, a1, v0.t
+; RV64-NEXT:    vsll.vi v24, v20, 1, v0.t
+; RV64-NEXT:    vxor.vv v24, v20, v24, v0.t
+; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v12, v20, v0.t
+; RV64-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV64-NEXT:    vand.vv v12, v12, v20, v0.t
+; RV64-NEXT:    vadd.vv v12, v16, v12, v0.t
+; RV64-NEXT:    vsrl.vi v16, v12, 4, v0.t
+; RV64-NEXT:    vadd.vv v12, v12, v16, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vsrl.vi v8, v8, 3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1441,35 +1381,26 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsll.vi v20, v16, 2
+; RV32-NEXT:    vxor.vv v20, v16, v20
+; RV32-NEXT:    vadd.vv v24, v20, v20
+; RV32-NEXT:    vxor.vv v24, v20, v24
+; RV32-NEXT:    vand.vv v12, v12, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v20
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vadd.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v12, v16, 3
+; RV32-NEXT:    vand.vv v12, v16, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1488,37 +1419,34 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v12, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT:    vsll.vi v20, v16, 2
+; RV64-NEXT:    vxor.vx v20, v20, a1
+; RV64-NEXT:    vadd.vv v24, v20, v20
+; RV64-NEXT:    vxor.vv v24, v20, v24
+; RV64-NEXT:    vand.vv v12, v12, v24
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vv v12, v8, v20
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v20
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v12, v16, 3
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1533,24 +1461,13 @@ declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
 define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1566,40 +1483,80 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v15i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1611,39 +1568,65 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
@@ -1652,24 +1635,12 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v15i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -1685,40 +1656,58 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vsll.vi v0, v8, 2
+; RV32-NEXT:    vxor.vv v0, v8, v0
+; RV32-NEXT:    vadd.vv v24, v0, v0
+; RV32-NEXT:    vxor.vv v24, v0, v24
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v24, v16, v0
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 3
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vmul.vv v8, v16, v8
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v15i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -1730,39 +1719,54 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vnot.v v16, v8
+; RV64-NEXT:    vsrl.vi v8, v16, 1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v0, v8, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v24, v0, v24
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v24, v8, v24
+; RV64-NEXT:    vsub.vv v16, v16, v24
+; RV64-NEXT:    vand.vv v24, v16, v0
+; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vv v16, v16, v0
+; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v8, 3
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vmul.vv v8, v16, v8
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <15 x i1> poison, i1 true, i32 0
   %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
@@ -1775,24 +1779,13 @@ declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
 define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1808,40 +1801,80 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_ctlz_v16i64:
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vp_ctlz_v16i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1853,39 +1886,65 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
@@ -1894,24 +1953,12 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v16i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -1927,40 +1974,58 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vsll.vi v0, v8, 2
+; RV32-NEXT:    vxor.vv v0, v8, v0
+; RV32-NEXT:    vadd.vv v24, v0, v0
+; RV32-NEXT:    vxor.vv v24, v0, v24
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v24, v16, v0
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 3
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vmul.vv v8, v16, v8
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -1972,39 +2037,54 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vnot.v v16, v8
+; RV64-NEXT:    vsrl.vi v8, v16, 1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v0, v8, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v24, v0, v24
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v24, v8, v24
+; RV64-NEXT:    vsub.vv v16, v16, v24
+; RV64-NEXT:    vand.vv v24, v16, v0
+; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vv v16, v16, v0
+; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v8, 3
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vmul.vv v8, v16, v8
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <16 x i1> poison, i1 true, i32 0
   %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -2017,39 +2097,24 @@ declare <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32)
 define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB34_2
+; RV32-NEXT:    bltu a0, a1, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB34_2:
@@ -2069,113 +2134,114 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    addi a3, a3, -241
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
@@ -2184,9 +2250,10 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -2201,81 +2268,119 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v32i64:
@@ -2283,23 +2388,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a2, 48
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB34_2
+; RV64-NEXT:    mv a3, a0
+; RV64-NEXT:    bltu a0, a1, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:  .LBB34_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
@@ -2310,81 +2417,238 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
+; RV64-NEXT:    li a2, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    li a5, 40
+; RV64-NEXT:    mul a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a4, a0, -16
+; RV64-NEXT:    sltu a0, a0, a4
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a4
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a0, a0, a4
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsll.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -2395,160 +2659,170 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
-; RV32-NEXT:    vmv8r.v v24, v16
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    addi a3, a3, -241
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v16, v0, v0
+; RV32-NEXT:    vxor.vv v16, v0, v16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vor.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v16, 8
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v16, 16
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vx v0, v16, a2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vnot.v v8, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v16, v0, v0
+; RV32-NEXT:    vxor.vv v16, v0, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v32i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB35_2
-; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; RV64-NEXT:    li a1, 16
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    bltu a0, a1, .LBB35_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:  .LBB35_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
@@ -2559,69 +2833,117 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v24, v8, a1
+; RV64-NEXT:    li a3, 32
+; RV64-NEXT:    vsrl.vx v24, v8, a3
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v0, v16, 2
+; RV64-NEXT:    vmv8r.v v24, v16
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v16, v0, v0
+; RV64-NEXT:    vxor.vv v16, v0, v16
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v16, v8, v0
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v16, v24, 3
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a2
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a4, a0, -16
+; RV64-NEXT:    sltu a0, a0, a4
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    and a0, a0, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 16
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vx v24, v16, a1
-; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vor.vv v16, v8, v16
+; RV64-NEXT:    vsrl.vi v0, v16, 2
+; RV64-NEXT:    vor.vv v16, v16, v0
+; RV64-NEXT:    vsrl.vi v0, v16, 4
+; RV64-NEXT:    vor.vv v16, v16, v0
+; RV64-NEXT:    vsrl.vi v0, v16, 8
+; RV64-NEXT:    vor.vv v16, v16, v0
+; RV64-NEXT:    vsrl.vi v0, v16, 16
+; RV64-NEXT:    vor.vv v16, v16, v0
+; RV64-NEXT:    vsrl.vx v0, v16, a3
+; RV64-NEXT:    vor.vv v16, v16, v0
 ; RV64-NEXT:    vnot.v v16, v16
+; RV64-NEXT:    vsll.vi v0, v24, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v8, v0, v24
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vv v8, v24, v8
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    vand.vv v16, v8, v0
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vx v16, v8, a2
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <32 x i1> poison, i1 true, i32 0
   %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
@@ -3512,35 +3834,26 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsll.vi v11, v10, 2, v0.t
+; RV32-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT:    vsll.vi v12, v11, 1, v0.t
+; RV32-NEXT:    vxor.vv v12, v11, v12, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v12, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v11, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v9, v10, 3, v0.t
+; RV32-NEXT:    vand.vv v9, v10, v9, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -3559,37 +3872,34 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT:    vsll.vi v11, v10, 2, v0.t
+; RV64-NEXT:    vxor.vx v11, v11, a1, v0.t
+; RV64-NEXT:    vsll.vi v12, v11, 1, v0.t
+; RV64-NEXT:    vxor.vv v12, v11, v12, v0.t
+; RV64-NEXT:    vand.vv v9, v9, v12, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v9, v8, v11, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v9, v10, 3, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -3616,35 +3926,26 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsll.vi v11, v10, 2
+; RV32-NEXT:    vxor.vv v11, v10, v11
+; RV32-NEXT:    vadd.vv v12, v11, v11
+; RV32-NEXT:    vxor.vv v12, v11, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v11
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vadd.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsrl.vi v9, v10, 3
+; RV32-NEXT:    vand.vv v9, v10, v9
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3663,37 +3964,34 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v9, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT:    vsll.vi v11, v10, 2
+; RV64-NEXT:    vxor.vx v11, v11, a1
+; RV64-NEXT:    vadd.vv v12, v11, v11
+; RV64-NEXT:    vxor.vv v12, v11, v12
+; RV64-NEXT:    vand.vv v9, v9, v12
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vv v9, v8, v11
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v11
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v9, v10, 3
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3722,35 +4020,26 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsll.vi v14, v12, 2, v0.t
+; RV32-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT:    vsll.vi v16, v14, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v14, v16, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v16, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v14, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v10, v12, 3, v0.t
+; RV32-NEXT:    vand.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -3769,37 +4058,34 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT:    vsll.vi v14, v12, 2, v0.t
+; RV64-NEXT:    vxor.vx v14, v14, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v14, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v14, v16, v0.t
+; RV64-NEXT:    vand.vv v10, v10, v16, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v10, v8, v14, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v10, v12, 3, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -3826,35 +4112,26 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsll.vi v14, v12, 2
+; RV32-NEXT:    vxor.vv v14, v12, v14
+; RV32-NEXT:    vadd.vv v16, v14, v14
+; RV32-NEXT:    vxor.vv v16, v14, v16
+; RV32-NEXT:    vand.vv v10, v10, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v14
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v10, v12, 3
+; RV32-NEXT:    vand.vv v10, v12, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3873,37 +4150,34 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v10, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT:    vsll.vi v14, v12, 2
+; RV64-NEXT:    vxor.vx v14, v14, a1
+; RV64-NEXT:    vadd.vv v16, v14, v14
+; RV64-NEXT:    vxor.vv v16, v14, v16
+; RV64-NEXT:    vand.vv v10, v10, v16
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vv v10, v8, v14
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v14
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v10, v12, 3
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3930,38 +4204,29 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v12, 1, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsll.vi v20, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v20, v8, v20, v0.t
+; RV32-NEXT:    vsll.vi v24, v20, 1, v0.t
+; RV32-NEXT:    vxor.vv v24, v20, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v20, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v20, v0.t
+; RV32-NEXT:    vadd.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vsrl.vi v16, v12, 4, v0.t
+; RV32-NEXT:    vadd.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    ret
@@ -3979,37 +4244,34 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vnot.v v12, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v12, 1, v0.t
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT:    vsll.vi v20, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v20, v20, a1, v0.t
+; RV64-NEXT:    vsll.vi v24, v20, 1, v0.t
+; RV64-NEXT:    vxor.vv v24, v20, v24, v0.t
+; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v12, v20, v0.t
+; RV64-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV64-NEXT:    vand.vv v12, v12, v20, v0.t
+; RV64-NEXT:    vadd.vv v12, v16, v12, v0.t
+; RV64-NEXT:    vsrl.vi v16, v12, 4, v0.t
+; RV64-NEXT:    vadd.vv v12, v12, v16, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vsrl.vi v8, v8, 3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -4036,35 +4298,26 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsll.vi v20, v16, 2
+; RV32-NEXT:    vxor.vv v20, v16, v20
+; RV32-NEXT:    vadd.vv v24, v20, v20
+; RV32-NEXT:    vxor.vv v24, v20, v24
+; RV32-NEXT:    vand.vv v12, v12, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v20
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vadd.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v12, v16, 3
+; RV32-NEXT:    vand.vv v12, v16, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -4083,37 +4336,34 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v12, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV64-NEXT:    vsll.vi v20, v16, 2
+; RV64-NEXT:    vxor.vx v20, v20, a1
+; RV64-NEXT:    vadd.vv v24, v20, v20
+; RV64-NEXT:    vxor.vv v24, v20, v24
+; RV64-NEXT:    vand.vv v12, v12, v24
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vv v12, v8, v20
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v20
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v12, v16, 3
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -4126,24 +4376,13 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4159,40 +4398,80 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v15i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4204,39 +4483,65 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
@@ -4245,24 +4550,12 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -4278,40 +4571,58 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vsll.vi v0, v8, 2
+; RV32-NEXT:    vxor.vv v0, v8, v0
+; RV32-NEXT:    vadd.vv v24, v0, v0
+; RV32-NEXT:    vxor.vv v24, v0, v24
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v24, v16, v0
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 3
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vmul.vv v8, v16, v8
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -4323,39 +4634,54 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vnot.v v16, v8
+; RV64-NEXT:    vsrl.vi v8, v16, 1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v0, v8, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v24, v0, v24
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v24, v8, v24
+; RV64-NEXT:    vsub.vv v16, v16, v24
+; RV64-NEXT:    vand.vv v24, v16, v0
+; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vv v16, v16, v0
+; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v8, 3
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vmul.vv v8, v16, v8
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <15 x i1> poison, i1 true, i32 0
   %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
@@ -4366,24 +4692,13 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4399,40 +4714,80 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v16i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4444,39 +4799,65 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
@@ -4485,24 +4866,12 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -4518,40 +4887,58 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vsll.vi v0, v8, 2
+; RV32-NEXT:    vxor.vv v0, v8, v0
+; RV32-NEXT:    vadd.vv v24, v0, v0
+; RV32-NEXT:    vxor.vv v24, v0, v24
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v24, v16, v0
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 3
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vmul.vv v8, v16, v8
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -4563,39 +4950,54 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a1
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vnot.v v16, v8
+; RV64-NEXT:    vsrl.vi v8, v16, 1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v0, v8, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v24, v0, v24
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v24, v8, v24
+; RV64-NEXT:    vsub.vv v16, v16, v24
+; RV64-NEXT:    vand.vv v24, v16, v0
+; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vv v16, v16, v0
+; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v8, 3
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vmul.vv v8, v16, v8
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <16 x i1> poison, i1 true, i32 0
   %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -4606,39 +5008,24 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB70_2
+; RV32-NEXT:    bltu a0, a1, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB70_2:
@@ -4658,113 +5045,114 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    addi a3, a3, -241
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
@@ -4773,9 +5161,10 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -4790,81 +5179,119 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v32i64:
@@ -4872,23 +5299,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a2, 48
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB70_2
+; RV64-NEXT:    mv a3, a0
+; RV64-NEXT:    bltu a0, a1, .LBB70_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:  .LBB70_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
@@ -4899,81 +5328,238 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
+; RV64-NEXT:    li a2, 32
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    li a5, 40
+; RV64-NEXT:    mul a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a4, a0, -16
+; RV64-NEXT:    sltu a0, a0, a4
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    and a0, a0, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a0, a0, a4
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsll.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -4984,160 +5570,170 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
-; RV32-NEXT:    vmv8r.v v24, v16
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB71_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    addi a3, a3, -241
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v16, v0, v0
+; RV32-NEXT:    vxor.vv v16, v0, v16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vor.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v16, 8
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v16, 16
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vx v0, v16, a2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vnot.v v8, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v16, v0, v0
+; RV32-NEXT:    vxor.vv v16, v0, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB71_2
-; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; RV64-NEXT:    li a1, 16
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    bltu a0, a1, .LBB71_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:  .LBB71_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
@@ -5148,69 +5744,117 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v24, v8, a1
+; RV64-NEXT:    li a3, 32
+; RV64-NEXT:    vsrl.vx v24, v8, a3
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v0, v16, 2
+; RV64-NEXT:    vmv8r.v v24, v16
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v16, v0, v0
+; RV64-NEXT:    vxor.vv v16, v0, v16
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v16, v8, v0
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v16, v24, 3
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a2
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a4, a0, -16
+; RV64-NEXT:    sltu a0, a0, a4
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    and a0, a0, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 16
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vx v24, v16, a1
-; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vor.vv v16, v8, v16
+; RV64-NEXT:    vsrl.vi v0, v16, 2
+; RV64-NEXT:    vor.vv v16, v16, v0
+; RV64-NEXT:    vsrl.vi v0, v16, 4
+; RV64-NEXT:    vor.vv v16, v16, v0
+; RV64-NEXT:    vsrl.vi v0, v16, 8
+; RV64-NEXT:    vor.vv v16, v16, v0
+; RV64-NEXT:    vsrl.vi v0, v16, 16
+; RV64-NEXT:    vor.vv v16, v16, v0
+; RV64-NEXT:    vsrl.vx v0, v16, a3
+; RV64-NEXT:    vor.vv v16, v16, v0
 ; RV64-NEXT:    vnot.v v16, v16
+; RV64-NEXT:    vsll.vi v0, v24, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v8, v0, v24
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vv v8, v24, v8
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    vand.vv v16, v8, v0
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vx v16, v8, a2
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <32 x i1> poison, i1 true, i32 0
   %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
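
The updated CHECK lines above materialize only the 0x0f0f... byte mask (lui 61681 / addi(w) -241, splatted with vmv.v.x); the other ctpop constants are then derived from it in registers: one vsll.vi 2 + vxor for the 0x3333... mask, one vadd (x+x) or vsll.vi 1 + vxor for the 0x5555... mask, and one vsrl.vi 3 + vand for the 0x0101... multiplier. The following is a minimal standalone C++ sketch of those identities for reference only; the variable names are illustrative and not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      // The only constant actually materialized in the new codegen.
      uint64_t c4 = 0x0f0f0f0f0f0f0f0fULL;
      uint64_t c2 = c4 ^ (c4 << 2);   // vsll.vi 2 + vxor   -> 0x3333333333333333
      uint64_t c1 = c2 ^ (c2 << 1);   // vadd (x+x) + vxor  -> 0x5555555555555555
      uint64_t mul = c4 & (c4 >> 3);  // vsrl.vi 3 + vand   -> 0x0101010101010101
      assert(c2 == 0x3333333333333333ULL);
      assert(c1 == 0x5555555555555555ULL);
      assert(mul == 0x0101010101010101ULL);
      return 0;
    }

This is why the checks now show a single lui/addi(w) pair feeding vmv.v.x followed by shift/xor and shift/and sequences (plus extra vector spills/reloads) instead of four separate scalar materializations or stack-based vlse64 broadcasts.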
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index 277146cc1403e9..33ac13ffc2cac8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -262,35 +262,26 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    lui a1, 61681
+; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
+; RV32I-NEXT:    vsll.vi v11, v10, 2
+; RV32I-NEXT:    vxor.vv v11, v10, v11
+; RV32I-NEXT:    vadd.vv v12, v11, v11
+; RV32I-NEXT:    vxor.vv v12, v11, v12
+; RV32I-NEXT:    vand.vv v9, v9, v12
 ; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v11
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vand.vv v8, v8, v11
+; RV32I-NEXT:    vadd.vv v8, v9, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v9, v10, 3
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -316,32 +307,29 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v9, a1
+; RV64I-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vmv.v.x v10, a1
+; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64I-NEXT:    vsll.vi v11, v10, 2
+; RV64I-NEXT:    vxor.vx v11, v11, a1
+; RV64I-NEXT:    vadd.vv v12, v11, v11
+; RV64I-NEXT:    vxor.vv v12, v11, v12
+; RV64I-NEXT:    vand.vv v9, v9, v12
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vv v9, v8, v11
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vv v8, v8, v11
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vsrl.vi v9, v10, 3
+; RV64I-NEXT:    vand.vx v9, v9, a1
+; RV64I-NEXT:    vmul.vv v8, v8, v9
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -671,35 +659,26 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    lui a1, 61681
+; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
+; RV32I-NEXT:    vsll.vi v14, v12, 2
+; RV32I-NEXT:    vxor.vv v14, v12, v14
+; RV32I-NEXT:    vadd.vv v16, v14, v14
+; RV32I-NEXT:    vxor.vv v16, v14, v16
+; RV32I-NEXT:    vand.vv v10, v10, v16
 ; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v14
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vand.vv v8, v8, v14
+; RV32I-NEXT:    vadd.vv v8, v10, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v10, v12, 3
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -725,32 +704,29 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v10, a1
+; RV64I-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vmv.v.x v12, a1
+; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64I-NEXT:    vsll.vi v14, v12, 2
+; RV64I-NEXT:    vxor.vx v14, v14, a1
+; RV64I-NEXT:    vadd.vv v16, v14, v14
+; RV64I-NEXT:    vxor.vv v16, v14, v16
+; RV64I-NEXT:    vand.vv v10, v10, v16
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vv v10, v8, v14
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vv v8, v8, v14
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vsrl.vi v10, v12, 3
+; RV64I-NEXT:    vand.vx v10, v10, a1
+; RV64I-NEXT:    vmul.vv v8, v8, v10
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1061,35 +1037,26 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    lui a1, 61681
+; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
+; RV32I-NEXT:    vsll.vi v11, v10, 2
+; RV32I-NEXT:    vxor.vv v11, v10, v11
+; RV32I-NEXT:    vadd.vv v12, v11, v11
+; RV32I-NEXT:    vxor.vv v12, v11, v12
+; RV32I-NEXT:    vand.vv v9, v9, v12
 ; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v11
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vand.vv v8, v8, v11
+; RV32I-NEXT:    vadd.vv v8, v9, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v9, v10, 3
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1115,32 +1082,29 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v9, a1
+; RV64I-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vmv.v.x v10, a1
+; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64I-NEXT:    vsll.vi v11, v10, 2
+; RV64I-NEXT:    vxor.vx v11, v11, a1
+; RV64I-NEXT:    vadd.vv v12, v11, v11
+; RV64I-NEXT:    vxor.vv v12, v11, v12
+; RV64I-NEXT:    vand.vv v9, v9, v12
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vv v9, v8, v11
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vv v8, v8, v11
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vsrl.vi v9, v10, 3
+; RV64I-NEXT:    vand.vx v9, v9, a1
+; RV64I-NEXT:    vmul.vv v8, v8, v9
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1446,35 +1410,26 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    lui a1, 61681
+; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
+; RV32I-NEXT:    vsll.vi v14, v12, 2
+; RV32I-NEXT:    vxor.vv v14, v12, v14
+; RV32I-NEXT:    vadd.vv v16, v14, v14
+; RV32I-NEXT:    vxor.vv v16, v14, v16
+; RV32I-NEXT:    vand.vv v10, v10, v16
 ; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v14
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vand.vv v8, v8, v14
+; RV32I-NEXT:    vadd.vv v8, v10, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 4
 ; RV32I-NEXT:    vadd.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v10, v12, 3
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1500,32 +1455,29 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v10, a1
+; RV64I-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vmv.v.x v12, a1
+; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64I-NEXT:    vsll.vi v14, v12, 2
+; RV64I-NEXT:    vxor.vx v14, v14, a1
+; RV64I-NEXT:    vadd.vv v16, v14, v14
+; RV64I-NEXT:    vxor.vv v16, v14, v16
+; RV64I-NEXT:    vand.vv v10, v10, v16
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vv v10, v8, v14
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vv v8, v8, v14
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vsrl.vi v10, v12, 3
+; RV64I-NEXT:    vand.vx v10, v10, a1
+; RV64I-NEXT:    vmul.vv v8, v8, v10
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index c4b22955f84c4f..7e1e94cbed3755 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -681,37 +681,27 @@ declare <2 x i64> @llvm.vp.ctpop.v2i64(<2 x i64>, <2 x i1>, i32)
 define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT:    vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vand.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -719,34 +709,30 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v2i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT:    vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT:    vand.vv v11, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -757,37 +743,27 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v2i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2
+; RV32-NEXT:    vxor.vv v10, v9, v10
+; RV32-NEXT:    vadd.vv v11, v10, v10
+; RV32-NEXT:    vxor.vv v11, v10, v11
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v11, v12, v11
+; RV32-NEXT:    vsub.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v11, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vadd.vv v8, v11, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -795,34 +771,30 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v2i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vsll.vi v10, v9, 2
+; RV64-NEXT:    vxor.vx v10, v10, a1
+; RV64-NEXT:    vadd.vv v11, v10, v10
+; RV64-NEXT:    vxor.vv v11, v10, v11
+; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    vand.vv v11, v12, v11
+; RV64-NEXT:    vsub.vv v8, v8, v11
+; RV64-NEXT:    vand.vv v11, v8, v10
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v10
+; RV64-NEXT:    vadd.vv v8, v11, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v9, v9, 3
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -837,37 +809,27 @@ declare <4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64>, <4 x i1>, i32)
 define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT:    vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vand.vv v14, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -875,34 +837,30 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v4i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT:    vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT:    vand.vv v14, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -913,37 +871,27 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v4i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2
+; RV32-NEXT:    vxor.vv v12, v10, v12
+; RV32-NEXT:    vadd.vv v14, v12, v12
+; RV32-NEXT:    vxor.vv v14, v12, v14
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v14, v16, v14
+; RV32-NEXT:    vsub.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v14, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vadd.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -951,34 +899,30 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v4i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vsll.vi v12, v10, 2
+; RV64-NEXT:    vxor.vx v12, v12, a1
+; RV64-NEXT:    vadd.vv v14, v12, v12
+; RV64-NEXT:    vxor.vv v14, v12, v14
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vv v14, v16, v14
+; RV64-NEXT:    vsub.vv v8, v8, v14
+; RV64-NEXT:    vand.vv v14, v8, v12
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vadd.vv v8, v14, v8
+; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v12
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v10, v10, 3
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -993,37 +937,27 @@ declare <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64>, <8 x i1>, i32)
 define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT:    vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v20, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -1031,34 +965,30 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v8i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT:    vand.vv v20, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1069,37 +999,27 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v8i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2
+; RV32-NEXT:    vxor.vv v16, v12, v16
+; RV32-NEXT:    vadd.vv v20, v16, v16
+; RV32-NEXT:    vxor.vv v20, v16, v20
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vand.vv v20, v24, v20
+; RV32-NEXT:    vsub.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v20, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v20, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1107,34 +1027,30 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vsll.vi v16, v12, 2
+; RV64-NEXT:    vxor.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v20, v16, v16
+; RV64-NEXT:    vxor.vv v20, v16, v20
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vv v20, v24, v20
+; RV64-NEXT:    vsub.vv v8, v8, v20
+; RV64-NEXT:    vand.vv v20, v8, v16
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vadd.vv v8, v20, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v12, v12, 3
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1149,89 +1065,163 @@ declare <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64>, <15 x i1>, i32)
 define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v15i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v24, v16, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v24, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 1, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v24, v24, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
@@ -1240,91 +1230,100 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v15i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v15i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
-  %head = insertelement <15 x i1> poison, i1 true, i32 0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %head = insertelement <15 x i1> poison, i1 true, i32 0
   %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
   %v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
@@ -1335,89 +1334,163 @@ declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32)
 define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v16i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsll.vi v24, v16, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v24, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 1, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v24, v24, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
@@ -1426,89 +1499,98 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v16i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <16 x i1> poison, i1 true, i32 0
   %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -1521,117 +1603,151 @@ declare <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64>, <32 x i1>, i32)
 define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 2
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB34_2:
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a2, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a2), zero
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a2
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v24, v8, v16, v0.t
-; RV32-NEXT:    addi a2, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v16, 3, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a2
@@ -1639,52 +1755,102 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v32i64:
@@ -1692,81 +1858,263 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a2, 48
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB34_2
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    bltu a0, a1, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a1, 16
+; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:  .LBB34_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    addiw a2, a2, 819
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    addiw a3, a3, -241
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addiw a4, a4, 257
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    li a5, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
-; RV64-NEXT:    addi a6, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 40
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV64-NEXT:    vand.vx v8, v16, a1, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vmul.vv v16, v16, v8, v0.t
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    vsrl.vx v16, v16, a2, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a3, a0, -16
+; RV64-NEXT:    sltu a0, a0, a3
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
+; RV64-NEXT:    and a0, a0, a3
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a3, 40
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a2, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a3, 40
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -1777,190 +2125,223 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 40 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a2, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    addi a2, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a2), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v16, v24, v0
+; RV32-NEXT:    vadd.vv v0, v16, v16
+; RV32-NEXT:    vxor.vv v8, v16, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v0, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v0, v8
+; RV32-NEXT:    vand.vv v0, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v0, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v8, v16, a1
+; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v16, v24, 2
+; RV32-NEXT:    vxor.vv v16, v24, v16
+; RV32-NEXT:    vadd.vv v0, v16, v16
+; RV32-NEXT:    vxor.vv v8, v16, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v16, v8, v16
-; RV32-NEXT:    vand.vv v8, v16, v0
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v0, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v0, v8
+; RV32-NEXT:    vand.vv v0, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v32i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB35_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB35_2:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    addiw a2, a2, -241
 ; RV64-NEXT:    slli a3, a2, 32
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v8, a2
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a2
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsll.vi v0, v8, 2
+; RV64-NEXT:    vxor.vx v0, v0, a2
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v8, v0, v24
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v24, v16, 1
+; RV64-NEXT:    vand.vv v8, v24, v8
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    vand.vv v24, v8, v0
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vv v8, v8, v0
 ; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    addiw a3, a3, -241
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addiw a4, a4, 257
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a5, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a5
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v24, v16, 3
+; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vmul.vv v8, v8, v24
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a1
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a3, a0, -16
+; RV64-NEXT:    sltu a0, a0, a3
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
+; RV64-NEXT:    and a0, a0, a3
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a2
-; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a2
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v8, v24, v0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v0, v16, 1
+; RV64-NEXT:    vand.vv v8, v0, v8
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    vand.vv v0, v8, v24
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vadd.vv v8, v0, v8
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
 ; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vmul.vx v16, v16, a4
-; RV64-NEXT:    vsrl.vx v16, v16, a5
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vx v16, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <32 x i1> poison, i1 true, i32 0
   %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index b5114bbe491896..5cdcaf226d856f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -129,36 +129,27 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    vand.vv v9, v10, v9
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2
+; RV32-NEXT:    vxor.vv v10, v9, v10
+; RV32-NEXT:    vadd.vv v11, v10, v10
+; RV32-NEXT:    vxor.vv v11, v10, v11
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v11, v12, v11
+; RV32-NEXT:    vsub.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v11, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vadd.vv v8, v11, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
@@ -169,33 +160,30 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v9, a1
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
 ; RV64-NEXT:    lui a1, 61681
 ; RV64-NEXT:    addiw a1, a1, -241
 ; RV64-NEXT:    slli a2, a1, 32
 ; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vsll.vi v10, v9, 2
+; RV64-NEXT:    vxor.vx v10, v10, a1
+; RV64-NEXT:    vadd.vv v11, v10, v10
+; RV64-NEXT:    vxor.vv v11, v10, v11
+; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    vand.vv v11, v12, v11
+; RV64-NEXT:    vsub.vv v8, v8, v11
+; RV64-NEXT:    vand.vv v11, v8, v10
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v10
+; RV64-NEXT:    vadd.vv v8, v11, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
 ; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    lui a1, 4112
-; RV64-NEXT:    addiw a1, a1, 257
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vmul.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v9, v9, 3
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    li a1, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a1
 ; RV64-NEXT:    vse64.v v8, (a0)
@@ -435,36 +423,27 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v10, v12, v10
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2
+; RV32-NEXT:    vxor.vv v12, v10, v12
+; RV32-NEXT:    vadd.vv v14, v12, v12
+; RV32-NEXT:    vxor.vv v14, v12, v14
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v14, v16, v14
+; RV32-NEXT:    vsub.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v14, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vadd.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
@@ -475,33 +454,30 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v10, a1
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
 ; RV64-NEXT:    lui a1, 61681
 ; RV64-NEXT:    addiw a1, a1, -241
 ; RV64-NEXT:    slli a2, a1, 32
 ; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vsll.vi v12, v10, 2
+; RV64-NEXT:    vxor.vx v12, v12, a1
+; RV64-NEXT:    vadd.vv v14, v12, v12
+; RV64-NEXT:    vxor.vv v14, v12, v14
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vv v14, v16, v14
+; RV64-NEXT:    vsub.vv v8, v8, v14
+; RV64-NEXT:    vand.vv v14, v8, v12
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vadd.vv v8, v14, v8
+; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v12
 ; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    lui a1, 4112
-; RV64-NEXT:    addiw a1, a1, 257
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vmul.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v10, v10, 3
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    li a1, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a1
 ; RV64-NEXT:    vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 49f6ffd691292a..4a17552fc137ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -777,41 +777,31 @@ declare <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64>, i1 immarg, <2 x i1>, i32)
 define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT:    vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v12, v8, a0, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vand.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -819,38 +809,34 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_cttz_v2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT:    vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v12, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT:    vand.vv v11, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -861,41 +847,31 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v2i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2
+; RV32-NEXT:    vxor.vv v10, v9, v10
+; RV32-NEXT:    vadd.vv v11, v10, v10
+; RV32-NEXT:    vxor.vv v11, v10, v11
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v12, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v11, v12, v11
+; RV32-NEXT:    vsub.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v11, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vadd.vv v8, v11, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -903,38 +879,34 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_cttz_v2i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsub.vx v9, v8, a1
+; RV64-NEXT:    vsll.vi v10, v9, 2
+; RV64-NEXT:    vxor.vx v10, v10, a1
+; RV64-NEXT:    vadd.vv v11, v10, v10
+; RV64-NEXT:    vxor.vv v11, v10, v11
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v12, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    vand.vv v11, v12, v11
+; RV64-NEXT:    vsub.vv v8, v8, v11
+; RV64-NEXT:    vand.vv v11, v8, v10
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v10
+; RV64-NEXT:    vadd.vv v8, v11, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v9, v9, 3
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -949,41 +921,31 @@ declare <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64>, i1 immarg, <4 x i1>, i32)
 define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT:    vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v16, v8, a0, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vand.vv v14, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -991,38 +953,34 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_cttz_v4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT:    vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT:    vand.vv v14, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1033,41 +991,31 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v4i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2
+; RV32-NEXT:    vxor.vv v12, v10, v12
+; RV32-NEXT:    vadd.vv v14, v12, v12
+; RV32-NEXT:    vxor.vv v14, v12, v14
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v16, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v14, v16, v14
+; RV32-NEXT:    vsub.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v14, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vadd.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1075,38 +1023,34 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_cttz_v4i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsub.vx v10, v8, a1
+; RV64-NEXT:    vsll.vi v12, v10, 2
+; RV64-NEXT:    vxor.vx v12, v12, a1
+; RV64-NEXT:    vadd.vv v14, v12, v12
+; RV64-NEXT:    vxor.vv v14, v12, v14
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v16, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vv v14, v16, v14
+; RV64-NEXT:    vsub.vv v8, v8, v14
+; RV64-NEXT:    vand.vv v14, v8, v12
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vadd.vv v8, v14, v8
+; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v12
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v10, v10, 3
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1121,41 +1065,31 @@ declare <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64>, i1 immarg, <8 x i1>, i32)
 define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT:    vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v24, v8, a0, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v20, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -1163,38 +1097,34 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_cttz_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v24, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT:    vand.vv v20, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -1205,41 +1135,31 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v8i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2
+; RV32-NEXT:    vxor.vv v16, v12, v16
+; RV32-NEXT:    vadd.vv v20, v16, v16
+; RV32-NEXT:    vxor.vv v20, v16, v20
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v24, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vand.vv v20, v24, v20
+; RV32-NEXT:    vsub.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v20, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v20, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1247,38 +1167,34 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_cttz_v8i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsub.vx v12, v8, a1
+; RV64-NEXT:    vsll.vi v16, v12, 2
+; RV64-NEXT:    vxor.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v20, v16, v16
+; RV64-NEXT:    vxor.vv v20, v16, v20
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v24, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vv v20, v24, v20
+; RV64-NEXT:    vsub.vv v8, v8, v20
+; RV64-NEXT:    vand.vv v20, v8, v16
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vadd.vv v8, v20, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v12, v12, 3
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1293,196 +1209,290 @@ declare <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
 define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v15i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    ret
-  %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
-  ret <15 x i64> %v
-}
-
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
+  %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
+  ret <15 x i64> %v
+}
+
 define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v15i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v0, v8, a0
 ; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v15i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v0, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <15 x i1> poison, i1 true, i32 0
   %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
@@ -1495,97 +1505,182 @@ declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
 define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v16i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
@@ -1594,97 +1689,106 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v16i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v0, v8, a0
 ; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v16i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v0, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <16 x i1> poison, i1 true, i32 0
   %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -1697,155 +1801,139 @@ declare <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32)
 define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB34_2
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB34_2:
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vnot.v v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
@@ -1854,88 +1942,116 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vnot.v v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v32i64:
@@ -1943,14 +2059,21 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a2, 48
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
@@ -1959,72 +2082,231 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:  .LBB34_2:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 5
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v8, v16, a2, v0.t
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vnot.v v8, v16, v0.t
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV64-NEXT:    vand.vx v8, v16, a1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vmul.vv v16, v16, v8, v0.t
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vsrl.vx v16, v16, a3, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a4, a0, -16
+; RV64-NEXT:    sltu a0, a0, a4
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    and a0, a0, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vnot.v v16, v8, v0.t
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -2035,193 +2317,233 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
-; RV32-NEXT:    vmv8r.v v24, v16
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
-; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a2
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsub.vx v0, v8, a1
 ; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v8, v0, v0
+; RV32-NEXT:    vxor.vv v8, v0, v8
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v16, v8
 ; RV32-NEXT:    vand.vv v16, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v24, a2
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vand.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v16, v8, a1
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vand.vv v8, v0, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v16, v0, v0
+; RV32-NEXT:    vxor.vv v16, v0, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v32i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB35_2
-; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV64-NEXT:    li a1, 16
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    bltu a0, a1, .LBB35_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:  .LBB35_2:
-; RV64-NEXT:    li a2, 1
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v8, a2
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
 ; RV64-NEXT:    slli a3, a1, 32
 ; RV64-NEXT:    add a1, a1, a3
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    vsub.vx v0, v8, a2
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vsll.vi v0, v16, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v24, v0, v24
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vv v16, v16, v24
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v16, v8, v0
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v0, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a3
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a4, a0, -16
+; RV64-NEXT:    sltu a0, a0, a4
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    and a0, a0, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v16, a2
-; RV64-NEXT:    vnot.v v16, v16
-; RV64-NEXT:    vand.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v16, v8, a2
+; RV64-NEXT:    vnot.v v24, v8
+; RV64-NEXT:    vand.vv v16, v24, v16
+; RV64-NEXT:    vsll.vi v24, v0, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v8, v24, v0
+; RV64-NEXT:    vsrl.vi v0, v16, 1
+; RV64-NEXT:    vand.vv v8, v0, v8
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    vand.vv v16, v8, v24
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <32 x i1> poison, i1 true, i32 0
   %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
@@ -2976,41 +3298,31 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
 define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV32-NEXT:    vxor.vv v10, v9, v10, v0.t
+; RV32-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV32-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v12, v8, a0, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vand.vv v11, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -3018,38 +3330,34 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v10, v9, 2, v0.t
+; RV64-NEXT:    vxor.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vsll.vi v11, v10, 1, v0.t
+; RV64-NEXT:    vxor.vv v11, v10, v11, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v12, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v11, v12, v11, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v11, v0.t
+; RV64-NEXT:    vand.vv v11, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
-; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vadd.vv v8, v11, v8, v0.t
+; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v9, v9, 3, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -3060,41 +3368,31 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vi v10, v9, 2
+; RV32-NEXT:    vxor.vv v10, v9, v10
+; RV32-NEXT:    vadd.vv v11, v10, v10
+; RV32-NEXT:    vxor.vv v11, v10, v11
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v12, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v11, v12, v11
+; RV32-NEXT:    vsub.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v11, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vadd.vv v8, v11, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v9, 3
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3102,38 +3400,34 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vsub.vx v9, v8, a1
+; RV64-NEXT:    vsll.vi v10, v9, 2
+; RV64-NEXT:    vxor.vx v10, v10, a1
+; RV64-NEXT:    vadd.vv v11, v10, v10
+; RV64-NEXT:    vxor.vv v11, v10, v11
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v12, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    vand.vv v11, v12, v11
+; RV64-NEXT:    vsub.vv v8, v8, v11
+; RV64-NEXT:    vand.vv v11, v8, v10
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v10
+; RV64-NEXT:    vadd.vv v8, v11, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v9, v9, 3
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3146,41 +3440,31 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV32-NEXT:    vxor.vv v12, v10, v12, v0.t
+; RV32-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV32-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v16, v8, a0, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vand.vv v14, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -3188,38 +3472,34 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v12, v10, 2, v0.t
+; RV64-NEXT:    vxor.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vsll.vi v14, v12, 1, v0.t
+; RV64-NEXT:    vxor.vv v14, v12, v14, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v14, v16, v14, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v14, v0.t
+; RV64-NEXT:    vand.vv v14, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vadd.vv v8, v14, v8, v0.t
+; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v10, v10, 3, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -3230,41 +3510,31 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsll.vi v12, v10, 2
+; RV32-NEXT:    vxor.vv v12, v10, v12
+; RV32-NEXT:    vadd.vv v14, v12, v12
+; RV32-NEXT:    vxor.vv v14, v12, v14
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v16, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v14, v16, v14
+; RV32-NEXT:    vsub.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v14, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vadd.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v10, 3
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3272,38 +3542,34 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vsub.vx v10, v8, a1
+; RV64-NEXT:    vsll.vi v12, v10, 2
+; RV64-NEXT:    vxor.vx v12, v12, a1
+; RV64-NEXT:    vadd.vv v14, v12, v12
+; RV64-NEXT:    vxor.vv v14, v12, v14
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v16, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vv v14, v16, v14
+; RV64-NEXT:    vsub.vv v8, v8, v14
+; RV64-NEXT:    vand.vv v14, v8, v12
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v12
+; RV64-NEXT:    vadd.vv v8, v14, v8
+; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v12
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v10, v10, 3
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3316,41 +3582,31 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV32-NEXT:    vxor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV32-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v24, v8, a0, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v20, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
@@ -3358,38 +3614,34 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v12, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v20, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v20, v16, v20, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v24, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vv v20, v24, v20, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v20, v0.t
+; RV64-NEXT:    vand.vv v20, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v20, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vsrl.vi v12, v12, 3, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
@@ -3400,41 +3652,31 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsll.vi v16, v12, 2
+; RV32-NEXT:    vxor.vv v16, v12, v16
+; RV32-NEXT:    vadd.vv v20, v16, v16
+; RV32-NEXT:    vxor.vv v20, v16, v20
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v24, v8, a0
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vand.vv v20, v24, v20
+; RV32-NEXT:    vsub.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v20, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v20, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v12, 3
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3442,38 +3684,34 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vsub.vx v12, v8, a1
+; RV64-NEXT:    vsll.vi v16, v12, 2
+; RV64-NEXT:    vxor.vx v16, v16, a1
+; RV64-NEXT:    vadd.vv v20, v16, v16
+; RV64-NEXT:    vxor.vv v20, v16, v20
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v24, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vv v20, v24, v20
+; RV64-NEXT:    vsub.vv v8, v8, v20
+; RV64-NEXT:    vand.vv v20, v8, v16
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vadd.vv v8, v20, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vsrl.vi v12, v12, 3
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
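
For the LMUL=8 cases in the hunks that follow (v15i64 and up), deriving the masks in vector registers raises register pressure, so the new code spills whole 8-register groups with `vs8r.v`/`vl8r.v` into vlenb-scaled stack slots; the frame grows by a multiple of vlenb (e.g. `slli a1, a1, 5`, i.e. 32 * vlenb, in the masked v15i64/v16i64 variants), which is what the `.cfi_escape ... # sp + 16 + N * vlenb` comments record. A small sketch of the slot arithmetic, assuming an example VLEN of 128 bits (so vlenb = 16); the numbers are illustrative, not taken from the patch:

#include <stdio.h>

int main(void) {
  unsigned vlenb = 16;               /* example only: VLEN = 128 bits -> 16 bytes per vreg */
  unsigned frame = 16 + 32 * vlenb;  /* addi sp,sp,-16 plus slli a1,a1,5 / sub sp,sp,a1 */
  /* Each vs8r.v/vl8r.v slot holds an 8-register group, i.e. 8 * vlenb bytes. */
  for (unsigned slot = 0; slot < 4; ++slot)
    printf("m8 slot %u at sp + %u\n", slot, 16 + slot * 8 * vlenb);
  printf("frame growth: %u bytes\n", frame);
  return 0;
}
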
@@ -3486,97 +3724,182 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v15i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
   ret <15 x i64> %v
@@ -3585,97 +3908,106 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v0, v8, a0
 ; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v0, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <15 x i1> poison, i1 true, i32 0
   %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
@@ -3686,97 +4018,182 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v8, 2, v0.t
+; RV32-NEXT:    vxor.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v16i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vsll.vi v16, v24, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v24, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
@@ -3785,97 +4202,106 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
+; RV32-NEXT:    vsll.vi v24, v16, 2
+; RV32-NEXT:    vxor.vv v24, v16, v24
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    vsub.vx v0, v8, a0
 ; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vadd.vv v0, v24, v24
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 3
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    vsll.vi v24, v16, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsub.vx v0, v8, a0
 ; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v0, v24, v0
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vv v16, v16, v0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vv v16, v8, v24
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <16 x i1> poison, i1 true, i32 0
   %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
@@ -3886,155 +4312,139 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB70_2
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB70_2:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vnot.v v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
@@ -4043,88 +4453,116 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 2, v0.t
+; RV32-NEXT:    vxor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV32-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vnot.v v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v32i64:
@@ -4132,14 +4570,21 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a2, 48
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
@@ -4148,72 +4593,231 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:  .LBB70_2:
-; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
+; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 5
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vxor.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v8, v16, a2, v0.t
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vnot.v v8, v16, v0.t
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v8, 3, v0.t
+; RV64-NEXT:    vand.vx v8, v16, a1, v0.t
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vmul.vv v16, v16, v8, v0.t
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vsrl.vx v16, v16, a3, v0.t
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a4, a0, -16
+; RV64-NEXT:    sltu a0, a0, a4
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    and a0, a0, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a4, 40
+; RV64-NEXT:    mul a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsll.vi v16, v8, 2, v0.t
+; RV64-NEXT:    vxor.vx v16, v16, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vi v8, v16, 1, v0.t
+; RV64-NEXT:    vxor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a4, 24
+; RV64-NEXT:    mul a0, a0, a4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vnot.v v16, v8, v0.t
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -4224,193 +4828,233 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
-; RV32-NEXT:    vmv8r.v v24, v16
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB71_2:
-; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a2
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsub.vx v0, v8, a1
 ; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v8, v0, v0
+; RV32-NEXT:    vxor.vv v8, v0, v8
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v16, v8
 ; RV32-NEXT:    vand.vv v16, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v24, a2
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vand.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vx v16, v8, a1
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vand.vv v8, v0, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vi v0, v24, 2
+; RV32-NEXT:    vxor.vv v0, v24, v0
+; RV32-NEXT:    vadd.vv v16, v0, v0
+; RV32-NEXT:    vxor.vv v16, v0, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v16, v24, 3
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    mv a1, a0
-; RV64-NEXT:    bltu a0, a2, .LBB71_2
-; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV64-NEXT:    li a1, 16
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    bltu a0, a1, .LBB71_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:  .LBB71_2:
-; RV64-NEXT:    li a2, 1
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v8, a2
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    addiw a1, a1, -241
 ; RV64-NEXT:    slli a3, a1, 32
 ; RV64-NEXT:    add a1, a1, a3
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v16, a1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    vsub.vx v0, v8, a2
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vsll.vi v0, v16, 2
+; RV64-NEXT:    vxor.vx v0, v0, a1
+; RV64-NEXT:    vadd.vv v24, v0, v0
+; RV64-NEXT:    vxor.vv v24, v0, v24
+; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    vand.vv v16, v16, v24
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v16, v8, v0
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v0, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a3
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a4, a0, -16
+; RV64-NEXT:    sltu a0, a0, a4
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    and a0, a0, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v16, a2
-; RV64-NEXT:    vnot.v v16, v16
-; RV64-NEXT:    vand.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
-; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vi v24, v16, 4
-; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsub.vx v16, v8, a2
+; RV64-NEXT:    vnot.v v24, v8
+; RV64-NEXT:    vand.vv v16, v24, v16
+; RV64-NEXT:    vsll.vi v24, v0, 2
+; RV64-NEXT:    vxor.vx v24, v24, a1
+; RV64-NEXT:    vadd.vv v0, v24, v24
+; RV64-NEXT:    vxor.vv v8, v24, v0
+; RV64-NEXT:    vsrl.vi v0, v16, 1
+; RV64-NEXT:    vand.vv v8, v0, v8
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    vand.vv v16, v8, v24
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vadd.vv v8, v16, v8
+; RV64-NEXT:    vsrl.vi v16, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsrl.vi v16, v16, 3
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %head = insertelement <32 x i1> poison, i1 true, i32 0
   %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
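
(Aside on the test churn above, for illustration only: the updated check lines all follow the same shape. Only the 0x0f0f0f0f0f0f0f0f mask is materialized; the remaining popcount constants are derived in-register, via vsll+vxor for 0x3333..., vadd+vxor for 0x5555..., and vsrl+vand for 0x0101.... A scalar C sketch of the equivalent derivation, assuming the usual cttz-via-ctpop expansion; the actual lowering is the vector code shown in the checks:)

  #include <stdint.h>

  // Sketch: derive the three remaining popcount masks from the one constant
  // that is actually materialized (0x0f0f0f0f0f0f0f0f), mirroring the
  // vsll/vxor, vadd/vxor and vsrl/vand sequences in the updated check lines.
  static inline uint64_t cttz_via_ctpop(uint64_t x) {
    const uint64_t c3 = UINT64_C(0x0f0f0f0f0f0f0f0f); // lui+addiw+slli+add
    const uint64_t c2 = c3 ^ (c3 << 2);               // 0x3333333333333333
    const uint64_t c1 = c2 ^ (c2 << 1);               // 0x5555555555555555
    const uint64_t c0 = c3 & (c3 >> 3);               // 0x0101010101010101
    x = ~x & (x - 1);                                 // isolate trailing zeros
    x -= (x >> 1) & c1;                               // pairwise popcount
    x = (x & c2) + ((x >> 2) & c2);                   // nibble popcount
    x = (x + (x >> 4)) & c3;                          // byte popcount
    return (x * c0) >> 56;                            // vmul.vv + vsrl.vx 56
  }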
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 8c8da6d1e00313..c778fab049005e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -250,40 +250,31 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v9, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vsll.vi v10, v9, 2
+; RV32I-NEXT:    vxor.vv v10, v9, v10
+; RV32I-NEXT:    vadd.vv v11, v10, v10
+; RV32I-NEXT:    vxor.vv v11, v10, v11
+; RV32I-NEXT:    li a1, 1
+; RV32I-NEXT:    vsub.vx v12, v8, a1
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v11, v12, v11
+; RV32I-NEXT:    vsub.vv v8, v8, v11
+; RV32I-NEXT:    vand.vv v11, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vadd.vv v8, v11, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vsrl.vi v10, v9, 3
+; RV32I-NEXT:    vand.vv v9, v9, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -294,37 +285,34 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v9, v8, a1
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v9
-; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v9, a1
-; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    vadd.vv v8, v9, v8
-; RV64I-NEXT:    vsrl.vi v9, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v9
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vmv.v.x v9, a1
+; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64I-NEXT:    vsll.vi v10, v9, 2
+; RV64I-NEXT:    vxor.vx v10, v10, a1
+; RV64I-NEXT:    vadd.vv v11, v10, v10
+; RV64I-NEXT:    vxor.vv v11, v10, v11
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    vsub.vx v12, v8, a2
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vand.vv v11, v12, v11
+; RV64I-NEXT:    vsub.vv v8, v8, v11
+; RV64I-NEXT:    vand.vv v11, v8, v10
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v10
+; RV64I-NEXT:    vadd.vv v8, v11, v8
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v10
 ; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vsrl.vi v9, v9, 3
+; RV64I-NEXT:    vand.vx v9, v9, a1
+; RV64I-NEXT:    vmul.vv v8, v8, v9
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -651,40 +639,31 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v10, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vsll.vi v12, v10, 2
+; RV32I-NEXT:    vxor.vv v12, v10, v12
+; RV32I-NEXT:    vadd.vv v14, v12, v12
+; RV32I-NEXT:    vxor.vv v14, v12, v14
+; RV32I-NEXT:    li a1, 1
+; RV32I-NEXT:    vsub.vx v16, v8, a1
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vand.vv v14, v16, v14
+; RV32I-NEXT:    vsub.vv v8, v8, v14
+; RV32I-NEXT:    vand.vv v14, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vadd.vv v8, v14, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vsrl.vi v12, v10, 3
+; RV32I-NEXT:    vand.vv v10, v10, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -695,37 +674,34 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v10, v8, a1
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v10
-; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v10, a1
-; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    vadd.vv v8, v10, v8
-; RV64I-NEXT:    vsrl.vi v10, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v10
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vmv.v.x v10, a1
+; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64I-NEXT:    vsll.vi v12, v10, 2
+; RV64I-NEXT:    vxor.vx v12, v12, a1
+; RV64I-NEXT:    vadd.vv v14, v12, v12
+; RV64I-NEXT:    vxor.vv v14, v12, v14
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    vsub.vx v16, v8, a2
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vand.vv v14, v16, v14
+; RV64I-NEXT:    vsub.vv v8, v8, v14
+; RV64I-NEXT:    vand.vv v14, v8, v12
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    vadd.vv v8, v14, v8
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v12
 ; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vsrl.vi v10, v10, 3
+; RV64I-NEXT:    vand.vx v10, v10, a1
+; RV64I-NEXT:    vmul.vv v8, v8, v10
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1029,40 +1005,31 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v9, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vsll.vi v10, v9, 2
+; RV32I-NEXT:    vxor.vv v10, v9, v10
+; RV32I-NEXT:    vadd.vv v11, v10, v10
+; RV32I-NEXT:    vxor.vv v11, v10, v11
+; RV32I-NEXT:    li a1, 1
+; RV32I-NEXT:    vsub.vx v12, v8, a1
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v11, v12, v11
+; RV32I-NEXT:    vsub.vv v8, v8, v11
+; RV32I-NEXT:    vand.vv v11, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vadd.vv v8, v11, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vsrl.vi v10, v9, 3
+; RV32I-NEXT:    vand.vv v9, v9, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1073,37 +1040,34 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v9, v8, a1
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v9
-; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v9, a1
-; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    vadd.vv v8, v9, v8
-; RV64I-NEXT:    vsrl.vi v9, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v9
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vmv.v.x v9, a1
+; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64I-NEXT:    vsll.vi v10, v9, 2
+; RV64I-NEXT:    vxor.vx v10, v10, a1
+; RV64I-NEXT:    vadd.vv v11, v10, v10
+; RV64I-NEXT:    vxor.vv v11, v10, v11
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    vsub.vx v12, v8, a2
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vand.vv v11, v12, v11
+; RV64I-NEXT:    vsub.vv v8, v8, v11
+; RV64I-NEXT:    vand.vv v11, v8, v10
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v10
+; RV64I-NEXT:    vadd.vv v8, v11, v8
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v10
 ; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vsrl.vi v9, v9, 3
+; RV64I-NEXT:    vand.vx v9, v9, a1
+; RV64I-NEXT:    vmul.vv v8, v8, v9
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1400,40 +1364,31 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v10, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi a1, a1, 1365
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vsll.vi v12, v10, 2
+; RV32I-NEXT:    vxor.vv v12, v10, v12
+; RV32I-NEXT:    vadd.vv v14, v12, v12
+; RV32I-NEXT:    vxor.vv v14, v12, v14
+; RV32I-NEXT:    li a1, 1
+; RV32I-NEXT:    vsub.vx v16, v8, a1
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vand.vv v14, v16, v14
+; RV32I-NEXT:    vsub.vv v8, v8, v14
+; RV32I-NEXT:    vand.vv v14, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vadd.vv v8, v14, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    lui a1, 4112
-; RV32I-NEXT:    addi a1, a1, 257
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vsrl.vi v12, v10, 3
+; RV32I-NEXT:    vand.vv v10, v10, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1444,37 +1399,34 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v10, v8, a1
-; RV64I-NEXT:    vnot.v v8, v8
-; RV64I-NEXT:    vand.vv v8, v8, v10
-; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v10, a1
-; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
-; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    vadd.vv v8, v10, v8
-; RV64I-NEXT:    vsrl.vi v10, v8, 4
-; RV64I-NEXT:    vadd.vv v8, v8, v10
 ; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    slli a2, a1, 32
 ; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vmv.v.x v10, a1
+; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64I-NEXT:    vsll.vi v12, v10, 2
+; RV64I-NEXT:    vxor.vx v12, v12, a1
+; RV64I-NEXT:    vadd.vv v14, v12, v12
+; RV64I-NEXT:    vxor.vv v14, v12, v14
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    vsub.vx v16, v8, a2
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vand.vv v14, v16, v14
+; RV64I-NEXT:    vsub.vv v8, v8, v14
+; RV64I-NEXT:    vand.vv v14, v8, v12
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    vadd.vv v8, v14, v8
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v12
 ; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vsrl.vi v10, v10, 3
+; RV64I-NEXT:    vand.vx v10, v10, a1
+; RV64I-NEXT:    vmul.vv v8, v8, v10
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index f707cb31e3eced..0b788878a601e5 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -176,30 +176,34 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sraw a0, a0, a1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw s0, a1, 1365
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw s1, a1, 819
 ; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi s2, a1, -241
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addi s3, a1, 257
+; RV64I-NEXT:    addiw s0, a1, -241
+; RV64I-NEXT:    slli a1, s0, 32
+; RV64I-NEXT:    add s0, s0, a1
+; RV64I-NEXT:    slli s1, s0, 2
+; RV64I-NEXT:    xor s1, s1, s0
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    addiw s2, a1, 1365
+; RV64I-NEXT:    srli a1, s0, 3
+; RV64I-NEXT:    and s3, a1, s0
 ; RV64I-NEXT:  .LBB4_1: # %bb2
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    call bar
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    and a0, a0, s0
-; RV64I-NEXT:    sub a0, a1, a0
-; RV64I-NEXT:    and a2, a0, s1
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, s1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, s2
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    srli a2, a2, 32
+; RV64I-NEXT:    sub a2, a2, a0
+; RV64I-NEXT:    and a0, a2, s1
+; RV64I-NEXT:    srli a2, a2, 2
+; RV64I-NEXT:    and a2, a2, s1
+; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    srli a2, a0, 4
 ; RV64I-NEXT:    add a0, a0, a2
-; RV64I-NEXT:    and a0, a0, s2
+; RV64I-NEXT:    and a0, a0, s0
 ; RV64I-NEXT:    mul a0, a0, s3
-; RV64I-NEXT:    srliw a0, a0, 24
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    bnez a1, .LBB4_1
 ; RV64I-NEXT:  # %bb.2: # %bb7
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -323,27 +327,23 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sraw a0, a0, a1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw s0, a1, 1365
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw s0, a1, -241
 ; RV64I-NEXT:    slli a1, s0, 32
 ; RV64I-NEXT:    add s0, s0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw s1, a1, 819
-; RV64I-NEXT:    slli a1, s1, 32
-; RV64I-NEXT:    add s1, s1, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw s2, a1, -241
+; RV64I-NEXT:    slli s1, s0, 2
+; RV64I-NEXT:    xor s1, s1, s0
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    addiw s2, a1, 1365
 ; RV64I-NEXT:    slli a1, s2, 32
 ; RV64I-NEXT:    add s2, s2, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw s3, a1, 257
-; RV64I-NEXT:    slli a1, s3, 32
-; RV64I-NEXT:    add s3, s3, a1
+; RV64I-NEXT:    srli a1, s0, 3
+; RV64I-NEXT:    and s3, a1, s0
 ; RV64I-NEXT:  .LBB6_1: # %bb2
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    call foo
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    and a1, a1, s0
+; RV64I-NEXT:    and a1, a1, s2
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    and a1, a0, s1
 ; RV64I-NEXT:    srli a0, a0, 2
@@ -351,7 +351,7 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    and a0, a0, s2
+; RV64I-NEXT:    and a0, a0, s0
 ; RV64I-NEXT:    mul a0, a0, s3
 ; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    bnez a0, .LBB6_1
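
For reference (not part of the patch): the updated checks above all share one pattern - only the 0x0f0f0f0f0f0f0f0f mask is fully materialized, and the other popcount constants are derived from it with a shift plus xor or a shift plus and. In the vector checks this shows up as vsll.vi/vxor, vadd.vv/vxor.vv (add-to-self acting as a shift by one), and vsrl.vi/vand; in the scalar loops only the 0x3333... and 0x0101... masks are derived, while 0x5555... is still built directly. A minimal C sketch of the derivation, with illustrative variable names that do not come from the patch:

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    /* Materialize only the 0x0f0f... mask; derive the remaining masks,
       mirroring the slli/xor and srli/and sequences in the updated checks. */
    uint64_t m4 = 0x0f0f0f0f0f0f0f0fULL;
    uint64_t m2 = m4 ^ (m4 << 2); /* shift+xor      -> 0x3333333333333333 */
    uint64_t m1 = m2 ^ (m2 << 1); /* add-self+xor   -> 0x5555555555555555 */
    uint64_t mc = m4 & (m4 >> 3); /* shift+and      -> 0x0101010101010101 */

    assert(m2 == 0x3333333333333333ULL);
    assert(m1 == 0x5555555555555555ULL);
    assert(mc == 0x0101010101010101ULL);
    return 0;
  }

Each derived constant costs two instructions from an already-available register, which is why it wins whenever direct materialization would take more than two instructions.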


