[llvm] 299b1b4 - [SelectionDAG][RISCV] Teach getConstant to use SPLAT_VECTOR_PARTS if vXi64 SPLAT_VECTOR is legal but i64 scalars are not.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 29 09:23:12 PDT 2023


Author: Craig Topper
Date: 2023-08-29T09:22:17-07:00
New Revision: 299b1b407148c519f899a5d799b45c19c7484cb8

URL: https://github.com/llvm/llvm-project/commit/299b1b407148c519f899a5d799b45c19c7484cb8
DIFF: https://github.com/llvm/llvm-project/commit/299b1b407148c519f899a5d799b45c19c7484cb8.diff

LOG: [SelectionDAG][RISCV] Teach getConstant to use SPLAT_VECTOR_PARTS if vXi64 SPLAT_VECTOR is legal but i64 scalars are not.

That matches how such a SPLAT_VECTOR would have been type legalized,
so it should be OK to use for creating constants after type legalization.

SPLAT_VECTOR lowering still needs some improvements.

This overlaps with some of what D158742 was trying to fix.

Reviewed By: luke

Differential Revision: https://reviews.llvm.org/D158870
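
For context, the following is a simplified, non-verbatim sketch of the logic in
SelectionDAG::getConstant that this patch extends. When the vector element type
needs integer expansion (i64 on RV32), the constant is split into legal-sized
pieces; previously only scalable vectors took the SPLAT_VECTOR_PARTS path, and
the added isOperationLegal(ISD::SPLAT_VECTOR, VT) check lets fixed vXi64
vectors use it as well. Identifiers outside the quoted hunk below are
reconstructed from memory and may not match the source exactly.

  // Simplified sketch of SelectionDAG::getConstant after this patch.
  if (NewNodesMustHaveLegalTypes && VT.isVector() &&
      TLI->getTypeAction(*getContext(), EltVT) ==
          TargetLowering::TypeExpandInteger) {
    const APInt &NewVal = Elt->getValue();
    EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT); // i32 on RV32
    unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();

    // Previously only scalable vectors took this path; fixed vectors now do
    // too when the target marks SPLAT_VECTOR as legal for VT (RVV vXi64).
    if (VT.isScalableVector() ||
        TLI->isOperationLegal(ISD::SPLAT_VECTOR, VT)) {
      unsigned Parts = EltVT.getSizeInBits() / ViaEltSizeInBits;

      // Split the expanded constant into Parts legal scalars, low part first.
      SmallVector<SDValue, 2> ScalarParts;
      for (unsigned i = 0; i != Parts; ++i)
        ScalarParts.push_back(getConstant(
            NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
            ViaEltVT, isT, isO));

      // SPLAT_VECTOR_PARTS is what the type legalizer would have produced for
      // a vXi64 SPLAT_VECTOR, so it is safe to create after type legalization.
      return getNode(ISD::SPLAT_VECTOR_PARTS, DL, VT, ScalarParts);
    }
  }

The RV32 test diffs below show the effect: vXi64 splat constants that were
previously built with e32 vmv.v.x/vmerge.vxm sequences are now materialized by
storing the two 32-bit halves to the stack and reloading them with a
zero-strided vlse64.v, which is how SPLAT_VECTOR_PARTS currently lowers for
these fixed vectors according to the updated checks.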

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b8423d70d4c47c..9ad0d736a0fee0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1631,7 +1631,8 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
     unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
 
     // For scalable vectors, try to use a SPLAT_VECTOR_PARTS node.
-    if (VT.isScalableVector()) {
+    if (VT.isScalableVector() ||
+        TLI->isOperationLegal(ISD::SPLAT_VECTOR, VT)) {
       assert(EltVT.getSizeInBits() % ViaEltSizeInBits == 0 &&
              "Can only handle an even split!");
       unsigned Parts = EltVT.getSizeInBits() / ViaEltSizeInBits;

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index dfb5213e460fcb..2a92f30b851714 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -1385,72 +1385,82 @@ declare <2 x i64> @llvm.vp.bitreverse.v2i64(<2 x i64>, <2 x i1>, i32)
 define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v9, v0
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    sw zero, 4(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v11, v8, a2, v0.t
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v11, v11, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v11, v10, v0.t
-; RV32-NEXT:    vsrl.vi v11, v8, 8, v0.t
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.i v0, 5
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v12, 0
-; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v12, v12, a4, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vand.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vsrl.vi v13, v8, 24, v0.t
+; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v13, v13, a4, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v13, v0.t
-; RV32-NEXT:    vor.vv v10, v11, v10, v0.t
-; RV32-NEXT:    vsll.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v13, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v13, v13, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v13, v0.t
-; RV32-NEXT:    vand.vx v13, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v13, v13, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v13, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v11, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v11, a1
+; RV32-NEXT:    vand.vx v10, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
+; RV32-NEXT:    mv a5, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v11, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v11, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v11, a1
+; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v11, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v11, a1
+; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v11, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v2i64:
@@ -1519,6 +1529,23 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e
 define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v2i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    sw zero, 4(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v9, v8, a1
@@ -1531,12 +1558,9 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v10, v8, a4
 ; RV32-NEXT:    vsll.vi v10, v10, 24
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.i v0, 5
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v11, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v11, v11, a5, v0
+; RV32-NEXT:    mv a5, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v11, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v11
 ; RV32-NEXT:    vsll.vi v12, v12, 8
@@ -1546,43 +1570,41 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    vsrl.vx v12, v8, a3
 ; RV32-NEXT:    vand.vx v12, v12, a2
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vand.vx v12, v12, a4
+; RV32-NEXT:    vsrl.vi v8, v8, 8
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v2i64_unmasked:
@@ -1655,73 +1677,82 @@ declare <4 x i64> @llvm.vp.bitreverse.v4i64(<4 x i64>, <4 x i1>, i32)
 define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    sw zero, 4(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v14, v8, a2, v0.t
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v14, v14, a3, v0.t
-; RV32-NEXT:    vor.vv v12, v14, v12, v0.t
-; RV32-NEXT:    vsrl.vi v14, v8, 8, v0.t
-; RV32-NEXT:    li a4, 85
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a4
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v10
-; RV32-NEXT:    vand.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vsrl.vi v18, v8, 24, v0.t
+; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v18, v18, a4, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v18, v0.t
-; RV32-NEXT:    vor.vv v12, v14, v12, v0.t
-; RV32-NEXT:    vsll.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v18, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v18, v18, a2, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v18, v0.t
-; RV32-NEXT:    vand.vx v18, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v18, v18, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v18, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v14, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v14, a1
+; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
+; RV32-NEXT:    mv a5, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v14, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v14, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v14, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v14, a1
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v14, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v14, a1
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v14, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v4i64:
@@ -1790,6 +1821,23 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e
 define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v4i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    sw zero, 4(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v8, a1
@@ -1802,13 +1850,9 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v12, v8, a4
 ; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    li a5, 85
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a5
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v14, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v14, v14, a5, v0
+; RV32-NEXT:    mv a5, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v14, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v14
 ; RV32-NEXT:    vsll.vi v16, v16, 8
@@ -1818,43 +1862,41 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    vsrl.vx v16, v8, a3
 ; RV32-NEXT:    vand.vx v16, v16, a2
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a4
+; RV32-NEXT:    vsrl.vi v8, v8, 8
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v4i64_unmasked:
@@ -1927,74 +1969,82 @@ declare <8 x i64> @llvm.vp.bitreverse.v8i64(<8 x i64>, <8 x i1>, i32)
 define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v12, v0
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    sw zero, 4(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v20, v20, a3, v0.t
-; RV32-NEXT:    vor.vv v20, v20, v16, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    lui a4, 5
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a4
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v12
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vsrl.vi v28, v8, 24, v0.t
+; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
+; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v28, v28, a4, v0.t
-; RV32-NEXT:    vor.vv v24, v24, v28, v0.t
+; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v20, v12, 24, v0.t
+; RV32-NEXT:    mv a5, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a5), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
-; RV32-NEXT:    vsll.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v28, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v28, v28, a2, v0.t
-; RV32-NEXT:    vor.vv v24, v24, v28, v0.t
-; RV32-NEXT:    vand.vx v28, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v28, v28, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v28, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v20, a1
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v20, a1
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v20, a1
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v8i64:
@@ -2063,6 +2113,23 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e
 define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v8i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    sw zero, 4(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v12, v8, a1
@@ -2075,14 +2142,9 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4
 ; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    lui a5, 5
-; RV32-NEXT:    addi a5, a5, 1365
-; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a5
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v20, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v20, v20, a5, v0
+; RV32-NEXT:    mv a5, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v20, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v20
 ; RV32-NEXT:    vsll.vi v24, v24, 8
@@ -2092,43 +2154,41 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    vsrl.vx v24, v8, a3
 ; RV32-NEXT:    vand.vx v24, v24, a2
 ; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v20, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 24
+; RV32-NEXT:    vand.vx v24, v24, a4
+; RV32-NEXT:    vsrl.vi v8, v8, 8
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v8i64_unmasked:
@@ -2201,250 +2261,129 @@ declare <15 x i64> @llvm.vp.bitreverse.v15i64(<15 x i64>, <15 x i1>, i32)
 define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT:    vmv1r.v v1, v0
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v24, v8, a3, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a4, a1, -256
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    vsll.vx v24, v24, a5, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 48
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 3
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 48
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 48
+; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    addi a5, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 48
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a6, 4080
-; RV32-NEXT:    vand.vx v24, v8, a6, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    lui a7, 1044480
-; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vxm v24, v24, a7, v0
-; RV32-NEXT:    addi a7, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v1
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    slli a7, a7, 3
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    slli a7, a7, 4
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    slli a7, a7, 4
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v24, v8, a5, v0.t
-; RV32-NEXT:    vand.vx v16, v24, a4, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v8, v8, a6, v0.t
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 4, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v15i64:
@@ -2530,80 +2469,87 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v15i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vsll.vx v16, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v8, a2
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v24, v24, a3
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    addi a4, sp, 48
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    li a4, 32
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    lui a5, 349525
-; RV32-NEXT:    addi a5, a5, 1365
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a5
-; RV32-NEXT:    lui a6, 1044480
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vxm v24, v24, a6, v0
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v16, v8, a4
+; RV32-NEXT:    vsll.vi v0, v16, 24
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    lui a6, 4080
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a6
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    addi a7, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vs8r.v v16, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a3
-; RV32-NEXT:    vsll.vx v0, v0, a2
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vand.vx v8, v8, a6
-; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a5, sp, 48
+; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v0, v8, a3
+; RV32-NEXT:    vand.vx v0, v0, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 24
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a5
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
@@ -2612,7 +2558,7 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v15i64_unmasked:
@@ -2685,250 +2631,129 @@ declare <16 x i64> @llvm.vp.bitreverse.v16i64(<16 x i64>, <16 x i1>, i32)
 define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT:    vmv1r.v v1, v0
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v24, v8, a3, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a4, a1, -256
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    vsll.vx v24, v24, a5, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 48
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 3
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 48
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 48
+; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    addi a5, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 48
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a6, 4080
-; RV32-NEXT:    vand.vx v24, v8, a6, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    lui a7, 1044480
-; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vxm v24, v24, a7, v0
-; RV32-NEXT:    addi a7, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v1
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    slli a7, a7, 3
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    slli a7, a7, 4
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    slli a7, a7, 4
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v24, v8, a5, v0.t
-; RV32-NEXT:    vand.vx v16, v24, a4, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v8, v8, a6, v0.t
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 4, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v16i64:
@@ -3014,80 +2839,87 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bitreverse_v16i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vsll.vx v16, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v8, a2
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v24, v24, a3
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    addi a4, sp, 48
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    li a4, 32
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    lui a5, 349525
-; RV32-NEXT:    addi a5, a5, 1365
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a5
-; RV32-NEXT:    lui a6, 1044480
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vxm v24, v24, a6, v0
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v16, v8, a4
+; RV32-NEXT:    vsll.vi v0, v16, 24
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    lui a6, 4080
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a6
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    addi a7, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vs8r.v v16, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a3
-; RV32-NEXT:    vsll.vx v0, v0, a2
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vand.vx v8, v8, a6
-; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a5, sp, 48
+; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v0, v8, a3
+; RV32-NEXT:    vand.vx v0, v0, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 24
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a5
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
@@ -3096,7 +2928,7 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bitreverse_v16i64_unmasked:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index 1dd84b1ea8178a..92ca238e08763a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -181,8 +181,25 @@ declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
 define void @bitreverse_v2i64(ptr %x, ptr %y) {
 ; RV32-LABEL: bitreverse_v2i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    sw zero, 4(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v9, v8, a1
 ; RV32-NEXT:    li a2, 40
@@ -191,61 +208,49 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v10, v10, a3
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vmv.v.i v0, 5
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, 0
-; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v10, v10, a4, v0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v11, v8, 8
-; RV32-NEXT:    vand.vv v11, v11, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    mv a4, sp
+; RV32-NEXT:    vlse64.v v11, (a4), zero
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v11, v9
-; RV32-NEXT:    vsll.vx v11, v8, a1
+; RV32-NEXT:    vand.vx v10, v10, a4
+; RV32-NEXT:    vsrl.vi v12, v8, 8
+; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    vor.vv v10, v12, v10
+; RV32-NEXT:    vor.vv v9, v10, v9
+; RV32-NEXT:    vsll.vx v10, v8, a1
 ; RV32-NEXT:    vand.vx v12, v8, a3
 ; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vand.vv v10, v8, v10
-; RV32-NEXT:    vsll.vi v10, v10, 8
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vor.vv v10, v10, v12
+; RV32-NEXT:    vand.vx v12, v8, a4
+; RV32-NEXT:    vsll.vi v12, v12, 24
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vlse64.v v11, (a1), zero
+; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v11
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bitreverse_v2i64:
@@ -736,8 +741,25 @@ declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
 define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ; LMULMAX2-RV32-LABEL: bitreverse_v4i64:
 ; LMULMAX2-RV32:       # %bb.0:
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX2-RV32-NEXT:    lui a1, 1044480
+; LMULMAX2-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    lui a1, 61681
+; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
+; LMULMAX2-RV32-NEXT:    lui a1, 209715
+; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
+; LMULMAX2-RV32-NEXT:    lui a1, 349525
+; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
 ; LMULMAX2-RV32-NEXT:    li a1, 56
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v10, v8, a1
 ; LMULMAX2-RV32-NEXT:    li a2, 40
@@ -746,63 +768,49 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ; LMULMAX2-RV32-NEXT:    addi a3, a3, -256
 ; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a3
 ; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT:    li a4, 85
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a4
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.i v12, 0
-; LMULMAX2-RV32-NEXT:    lui a4, 1044480
-; LMULMAX2-RV32-NEXT:    vmerge.vxm v12, v12, a4, v0
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vsrl.vi v14, v8, 8
-; LMULMAX2-RV32-NEXT:    vand.vv v14, v14, v12
-; LMULMAX2-RV32-NEXT:    vsrl.vi v16, v8, 24
+; LMULMAX2-RV32-NEXT:    vsrl.vi v12, v8, 24
+; LMULMAX2-RV32-NEXT:    mv a4, sp
+; LMULMAX2-RV32-NEXT:    vlse64.v v14, (a4), zero
 ; LMULMAX2-RV32-NEXT:    lui a4, 4080
-; LMULMAX2-RV32-NEXT:    vand.vx v16, v16, a4
-; LMULMAX2-RV32-NEXT:    vor.vv v14, v14, v16
-; LMULMAX2-RV32-NEXT:    vor.vv v10, v14, v10
-; LMULMAX2-RV32-NEXT:    vsll.vx v14, v8, a1
+; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a4
+; LMULMAX2-RV32-NEXT:    vsrl.vi v16, v8, 8
+; LMULMAX2-RV32-NEXT:    vand.vv v16, v16, v14
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v16, v12
+; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
+; LMULMAX2-RV32-NEXT:    vsll.vx v12, v8, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a3
 ; LMULMAX2-RV32-NEXT:    vsll.vx v16, v16, a2
-; LMULMAX2-RV32-NEXT:    vor.vv v14, v14, v16
-; LMULMAX2-RV32-NEXT:    vand.vv v12, v8, v12
-; LMULMAX2-RV32-NEXT:    vsll.vi v12, v12, 8
-; LMULMAX2-RV32-NEXT:    vand.vx v8, v8, a4
-; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 24
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v12
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v14, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v16
+; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a4
+; LMULMAX2-RV32-NEXT:    vsll.vi v16, v16, 24
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 8
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v16, v8
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32-NEXT:    vlse64.v v14, (a1), zero
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v12, v8
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT:    lui a1, 61681
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v12
+; LMULMAX2-RV32-NEXT:    vand.vv v10, v10, v14
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vlse64.v v12, (a1), zero
 ; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 4
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX2-RV32-NEXT:    lui a1, 209715
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vand.vv v10, v10, v12
 ; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v12
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32-NEXT:    vlse64.v v12, (a1), zero
 ; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 2
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT:    lui a1, 349525
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vand.vv v10, v10, v12
 ; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v12
 ; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v8
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: bitreverse_v4i64:
@@ -869,110 +877,115 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ;
 ; LMULMAX1-RV32-LABEL: bitreverse_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v10, (a1)
 ; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
+; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
+; LMULMAX1-RV32-NEXT:    vle64.v v9, (a1)
+; LMULMAX1-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX1-RV32-NEXT:    lui a2, 1044480
+; LMULMAX1-RV32-NEXT:    sw a2, 0(sp)
+; LMULMAX1-RV32-NEXT:    lui a2, 61681
+; LMULMAX1-RV32-NEXT:    addi a2, a2, -241
+; LMULMAX1-RV32-NEXT:    sw a2, 28(sp)
+; LMULMAX1-RV32-NEXT:    sw a2, 24(sp)
+; LMULMAX1-RV32-NEXT:    lui a2, 209715
+; LMULMAX1-RV32-NEXT:    addi a2, a2, 819
+; LMULMAX1-RV32-NEXT:    sw a2, 20(sp)
+; LMULMAX1-RV32-NEXT:    sw a2, 16(sp)
+; LMULMAX1-RV32-NEXT:    lui a2, 349525
+; LMULMAX1-RV32-NEXT:    addi a2, a2, 1365
+; LMULMAX1-RV32-NEXT:    sw a2, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a2, 8(sp)
 ; LMULMAX1-RV32-NEXT:    li a2, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v9, v10, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v9, a2
 ; LMULMAX1-RV32-NEXT:    li a3, 40
-; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v10, a3
+; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v9, a3
 ; LMULMAX1-RV32-NEXT:    lui a4, 16
 ; LMULMAX1-RV32-NEXT:    addi a4, a4, -256
 ; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v9
-; LMULMAX1-RV32-NEXT:    vmv.v.i v0, 5
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.i v9, 0
-; LMULMAX1-RV32-NEXT:    lui a5, 1044480
-; LMULMAX1-RV32-NEXT:    vmerge.vxm v9, v9, a5, v0
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vsrl.vi v12, v10, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v10, 24
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v9, 24
+; LMULMAX1-RV32-NEXT:    mv a5, sp
+; LMULMAX1-RV32-NEXT:    vlse64.v v12, (a5), zero
 ; LMULMAX1-RV32-NEXT:    lui a5, 4080
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v13, a5
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v12, v11
-; LMULMAX1-RV32-NEXT:    vsll.vx v12, v10, a2
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v10, a4
-; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v10, v9
-; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 8
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a5
-; LMULMAX1-RV32-NEXT:    vsll.vi v10, v10, 24
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v13
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v10, 4
-; LMULMAX1-RV32-NEXT:    lui a6, 61681
-; LMULMAX1-RV32-NEXT:    addi a6, a6, -241
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v12, a6
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v12
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX1-RV32-NEXT:    vsll.vi v10, v10, 4
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a5
+; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v9, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v10, 2
-; LMULMAX1-RV32-NEXT:    lui a6, 209715
-; LMULMAX1-RV32-NEXT:    addi a6, a6, 819
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v13, a6
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v13
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v9, v12
+; LMULMAX1-RV32-NEXT:    vsll.vi v11, v11, 8
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v9, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 24
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
+; LMULMAX1-RV32-NEXT:    vsll.vx v13, v9, a2
+; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a4
+; LMULMAX1-RV32-NEXT:    vsll.vx v9, v9, a3
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v13, v9
+; LMULMAX1-RV32-NEXT:    addi a6, sp, 24
+; LMULMAX1-RV32-NEXT:    vlse64.v v13, (a6), zero
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 4
 ; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v13
-; LMULMAX1-RV32-NEXT:    vsll.vi v10, v10, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v10, 1
-; LMULMAX1-RV32-NEXT:    lui a6, 349525
-; LMULMAX1-RV32-NEXT:    addi a6, a6, 1365
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v14, a6
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v14
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v13
+; LMULMAX1-RV32-NEXT:    addi a6, sp, 16
+; LMULMAX1-RV32-NEXT:    vlse64.v v11, (a6), zero
+; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v10, v9
+; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 2
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v11
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT:    addi a6, sp, 8
+; LMULMAX1-RV32-NEXT:    vlse64.v v14, (a6), zero
+; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v10, v9
+; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 1
 ; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v14
-; LMULMAX1-RV32-NEXT:    vadd.vv v10, v10, v10
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v14
+; LMULMAX1-RV32-NEXT:    vadd.vv v9, v9, v9
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v10, v9
+; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v8, a2
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v15, v8, a3
 ; LMULMAX1-RV32-NEXT:    vand.vx v15, v15, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v15, v11
-; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v8, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v15, v15, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v16, v8, 24
-; LMULMAX1-RV32-NEXT:    vand.vx v16, v16, a5
-; LMULMAX1-RV32-NEXT:    vor.vv v15, v15, v16
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v15, v11
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v15, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v8, 24
+; LMULMAX1-RV32-NEXT:    vand.vx v15, v15, a5
+; LMULMAX1-RV32-NEXT:    vsrl.vi v16, v8, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v16, v16, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v15, v16, v15
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v15, v10
 ; LMULMAX1-RV32-NEXT:    vsll.vx v15, v8, a2
 ; LMULMAX1-RV32-NEXT:    vand.vx v16, v8, a4
 ; LMULMAX1-RV32-NEXT:    vsll.vx v16, v16, a3
 ; LMULMAX1-RV32-NEXT:    vor.vv v15, v15, v16
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v8, v9
-; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 8
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a5
-; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v15, v8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v12
+; LMULMAX1-RV32-NEXT:    vand.vx v16, v8, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v16, v16, 24
 ; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v12
-; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v9, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v13
+; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v16, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v15, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v13
 ; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v13
+; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v10, v8
+; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 2
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v11
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v11
 ; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v9, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v14
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v10, v8
+; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v14
 ; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v14
 ; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v9, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v10, (a1)
+; LMULMAX1-RV32-NEXT:    vse64.v v9, (a1)
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bitreverse_v4i64:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index e20a36e184725a..c94df1117ed181 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -425,42 +425,43 @@ declare <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64>, <2 x i1>, i32)
 define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bswap_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v9, v0
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v11, v8, a2, v0.t
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v11, v11, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v11, v10, v0.t
-; RV32-NEXT:    vsrl.vi v11, v8, 8, v0.t
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.i v0, 5
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v12, 0
-; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v12, v12, a4, v0
+; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v10, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v11, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vand.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vsrl.vi v13, v8, 24, v0.t
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v13, v13, a0, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v13, v0.t
-; RV32-NEXT:    vor.vv v10, v11, v10, v0.t
-; RV32-NEXT:    vsll.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v13, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v13, v13, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v13, v0.t
-; RV32-NEXT:    vand.vx v13, v8, a0, v0.t
-; RV32-NEXT:    vsll.vi v13, v13, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v13, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v11, v8, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bswap_v2i64:
@@ -502,6 +503,11 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bswap_v2i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v9, v8, a1
@@ -514,12 +520,9 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v10, v8, a4
 ; RV32-NEXT:    vsll.vi v10, v10, 24
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.i v0, 5
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v11, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v11, v11, a5, v0
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v11, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v11
 ; RV32-NEXT:    vsll.vi v12, v12, 8
@@ -529,13 +532,14 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsrl.vx v12, v8, a3
 ; RV32-NEXT:    vand.vx v12, v12, a2
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vand.vx v12, v12, a4
+; RV32-NEXT:    vsrl.vi v8, v8, 8
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v9, v8
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bswap_v2i64_unmasked:
@@ -581,43 +585,43 @@ declare <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64>, <4 x i1>, i32)
 define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bswap_v4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v10, v0
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v14, v8, a2, v0.t
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v14, v14, a3, v0.t
-; RV32-NEXT:    vor.vv v12, v14, v12, v0.t
-; RV32-NEXT:    vsrl.vi v14, v8, 8, v0.t
-; RV32-NEXT:    li a4, 85
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a4
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
+; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v14, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v10
-; RV32-NEXT:    vand.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vsrl.vi v18, v8, 24, v0.t
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v18, v18, a0, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v18, v0.t
-; RV32-NEXT:    vor.vv v12, v14, v12, v0.t
-; RV32-NEXT:    vsll.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v18, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v18, v18, a2, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v18, v0.t
-; RV32-NEXT:    vand.vx v18, v8, a0, v0.t
-; RV32-NEXT:    vsll.vi v18, v18, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v18, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v14, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v14, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bswap_v4i64:
@@ -659,6 +663,11 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bswap_v4i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v8, a1
@@ -671,13 +680,9 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v12, v8, a4
 ; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    li a5, 85
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a5
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v14, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v14, v14, a5, v0
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v14, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v14
 ; RV32-NEXT:    vsll.vi v16, v16, 8
@@ -687,13 +692,14 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsrl.vx v16, v8, a3
 ; RV32-NEXT:    vand.vx v16, v16, a2
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a4
+; RV32-NEXT:    vsrl.vi v8, v8, 8
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bswap_v4i64_unmasked:
@@ -739,44 +745,43 @@ declare <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64>, <8 x i1>, i32)
 define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bswap_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vmv1r.v v12, v0
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v20, v20, a3, v0.t
-; RV32-NEXT:    vor.vv v20, v20, v16, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    lui a4, 5
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a4
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
+; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
+; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v20, v12, 24, v0.t
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v12
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vsrl.vi v28, v8, 24, v0.t
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v28, v28, a0, v0.t
-; RV32-NEXT:    vor.vv v24, v24, v28, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
+; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
-; RV32-NEXT:    vsll.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v28, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v28, v28, a2, v0.t
-; RV32-NEXT:    vor.vv v24, v24, v28, v0.t
-; RV32-NEXT:    vand.vx v28, v8, a0, v0.t
-; RV32-NEXT:    vsll.vi v28, v28, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v28, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bswap_v8i64:
@@ -818,6 +823,11 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_bswap_v8i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v12, v8, a1
@@ -830,14 +840,9 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v8, a4
 ; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    lui a5, 5
-; RV32-NEXT:    addi a5, a5, 1365
-; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a5
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v20, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v20, v20, a5, v0
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v20, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v20
 ; RV32-NEXT:    vsll.vi v24, v24, 8
@@ -847,13 +852,14 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsrl.vx v24, v8, a3
 ; RV32-NEXT:    vand.vx v24, v24, a2
 ; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v20, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 24
+; RV32-NEXT:    vand.vx v24, v24, a4
+; RV32-NEXT:    vsrl.vi v8, v8, 8
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bswap_v8i64_unmasked:
@@ -906,114 +912,72 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT:    vmv1r.v v1, v0
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v24, v8, a1, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a2, 16
 ; RV32-NEXT:    addi a2, a2, -256
 ; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 3
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    li a5, 32
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    lui a6, 349525
-; RV32-NEXT:    addi a6, a6, 1365
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    lui a7, 1044480
-; RV32-NEXT:    vmv.v.x v0, a6
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vxm v24, v24, a7, v0
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v1
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
@@ -1089,51 +1053,46 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vsll.vx v16, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v8, a2
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v24, v24, a3
+; RV32-NEXT:    vor.vv v16, v16, v24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    li a4, 32
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    lui a5, 349525
-; RV32-NEXT:    addi a5, a5, 1365
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a5
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vxm v24, v24, a5, v0
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v16, v8, a4
+; RV32-NEXT:    vsll.vi v0, v16, 24
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a3
-; RV32-NEXT:    vsll.vx v0, v0, a2
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v0, v8, a3
+; RV32-NEXT:    vand.vx v0, v0, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 24
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
@@ -1190,114 +1149,72 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT:    vmv1r.v v1, v0
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v24, v8, a1, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a2, 16
 ; RV32-NEXT:    addi a2, a2, -256
 ; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 3
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    li a5, 32
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    lui a6, 349525
-; RV32-NEXT:    addi a6, a6, 1365
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    lui a7, 1044480
-; RV32-NEXT:    vmv.v.x v0, a6
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vxm v24, v24, a7, v0
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v1
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
@@ -1373,51 +1290,46 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vsll.vx v16, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v8, a2
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v24, v24, a3
+; RV32-NEXT:    vor.vv v16, v16, v24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    li a4, 32
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    lui a5, 349525
-; RV32-NEXT:    addi a5, a5, 1365
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a5
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vxm v24, v24, a5, v0
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v16, v8, a4
+; RV32-NEXT:    vsll.vi v0, v16, 24
+; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a3
-; RV32-NEXT:    vsll.vx v0, v0, a2
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v0, v8, a3
+; RV32-NEXT:    vand.vx v0, v0, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 24
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
index 7006409c6eb878..c4b6eac8b8fe21 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -87,8 +87,13 @@ declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
 define void @bswap_v2i64(ptr %x, ptr %y) {
 ; RV32-LABEL: bswap_v2i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v9, v8, a1
 ; RV32-NEXT:    li a2, 40
@@ -97,31 +102,28 @@ define void @bswap_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v10, v10, a3
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vmv.v.i v0, 5
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, 0
-; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v10, v10, a4, v0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v11, v8, 8
-; RV32-NEXT:    vand.vv v11, v11, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vlse64.v v11, (a4), zero
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v11, v9
-; RV32-NEXT:    vsll.vx v11, v8, a1
+; RV32-NEXT:    vand.vx v10, v10, a4
+; RV32-NEXT:    vsrl.vi v12, v8, 8
+; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    vor.vv v10, v12, v10
+; RV32-NEXT:    vor.vv v9, v10, v9
+; RV32-NEXT:    vsll.vx v10, v8, a1
 ; RV32-NEXT:    vand.vx v12, v8, a3
 ; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vand.vv v10, v8, v10
-; RV32-NEXT:    vsll.vi v10, v10, 8
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vor.vv v10, v10, v12
+; RV32-NEXT:    vand.vx v12, v8, a4
+; RV32-NEXT:    vsll.vi v12, v12, 24
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vse64.v v8, (a0)
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bswap_v2i64:
@@ -357,8 +359,13 @@ declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
 define void @bswap_v4i64(ptr %x, ptr %y) {
 ; LMULMAX2-RV32-LABEL: bswap_v4i64:
 ; LMULMAX2-RV32:       # %bb.0:
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32-NEXT:    sw zero, 12(sp)
+; LMULMAX2-RV32-NEXT:    lui a1, 1044480
+; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
 ; LMULMAX2-RV32-NEXT:    li a1, 56
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v10, v8, a1
 ; LMULMAX2-RV32-NEXT:    li a2, 40
@@ -367,33 +374,28 @@ define void @bswap_v4i64(ptr %x, ptr %y) {
 ; LMULMAX2-RV32-NEXT:    addi a3, a3, -256
 ; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a3
 ; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT:    li a4, 85
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a4
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.i v12, 0
-; LMULMAX2-RV32-NEXT:    lui a4, 1044480
-; LMULMAX2-RV32-NEXT:    vmerge.vxm v12, v12, a4, v0
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vsrl.vi v14, v8, 8
-; LMULMAX2-RV32-NEXT:    vand.vv v14, v14, v12
-; LMULMAX2-RV32-NEXT:    vsrl.vi v16, v8, 24
+; LMULMAX2-RV32-NEXT:    vsrl.vi v12, v8, 24
+; LMULMAX2-RV32-NEXT:    addi a4, sp, 8
+; LMULMAX2-RV32-NEXT:    vlse64.v v14, (a4), zero
 ; LMULMAX2-RV32-NEXT:    lui a4, 4080
-; LMULMAX2-RV32-NEXT:    vand.vx v16, v16, a4
-; LMULMAX2-RV32-NEXT:    vor.vv v14, v14, v16
-; LMULMAX2-RV32-NEXT:    vor.vv v10, v14, v10
-; LMULMAX2-RV32-NEXT:    vsll.vx v14, v8, a1
+; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a4
+; LMULMAX2-RV32-NEXT:    vsrl.vi v16, v8, 8
+; LMULMAX2-RV32-NEXT:    vand.vv v16, v16, v14
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v16, v12
+; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
+; LMULMAX2-RV32-NEXT:    vsll.vx v12, v8, a1
 ; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a3
 ; LMULMAX2-RV32-NEXT:    vsll.vx v16, v16, a2
-; LMULMAX2-RV32-NEXT:    vor.vv v14, v14, v16
-; LMULMAX2-RV32-NEXT:    vand.vv v12, v8, v12
-; LMULMAX2-RV32-NEXT:    vsll.vi v12, v12, 8
-; LMULMAX2-RV32-NEXT:    vand.vx v8, v8, a4
-; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 24
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v12
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v14, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v16
+; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a4
+; LMULMAX2-RV32-NEXT:    vsll.vi v16, v16, 24
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 8
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v16, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v12, v8
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: bswap_v4i64:
@@ -433,65 +435,67 @@ define void @bswap_v4i64(ptr %x, ptr %y) {
 ;
 ; LMULMAX1-RV32-LABEL: bswap_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    vle64.v v9, (a0)
+; LMULMAX1-RV32-NEXT:    vle64.v v9, (a1)
+; LMULMAX1-RV32-NEXT:    sw zero, 12(sp)
+; LMULMAX1-RV32-NEXT:    lui a2, 1044480
+; LMULMAX1-RV32-NEXT:    sw a2, 8(sp)
 ; LMULMAX1-RV32-NEXT:    li a2, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v8, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v9, a2
 ; LMULMAX1-RV32-NEXT:    li a3, 40
-; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v8, a3
+; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v9, a3
 ; LMULMAX1-RV32-NEXT:    lui a4, 16
 ; LMULMAX1-RV32-NEXT:    addi a4, a4, -256
 ; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vmv.v.i v0, 5
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.i v11, 0
-; LMULMAX1-RV32-NEXT:    lui a5, 1044480
-; LMULMAX1-RV32-NEXT:    vmerge.vxm v11, v11, a5, v0
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vsrl.vi v12, v8, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v11
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v8, 24
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v9, 24
+; LMULMAX1-RV32-NEXT:    addi a5, sp, 8
+; LMULMAX1-RV32-NEXT:    vlse64.v v12, (a5), zero
 ; LMULMAX1-RV32-NEXT:    lui a5, 4080
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v13, a5
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX1-RV32-NEXT:    vsll.vx v12, v8, a2
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v8, a4
-; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v8, v11
-; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 8
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a5
-; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v12, v8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v9, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vx v12, v9, a3
-; LMULMAX1-RV32-NEXT:    vand.vx v12, v12, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v12, v9, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v11
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v9, 24
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v13, a5
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX1-RV32-NEXT:    vsll.vx v12, v9, a2
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v9, a4
-; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v9, v11
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a5
+; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v9, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v9, v12
 ; LMULMAX1-RV32-NEXT:    vsll.vi v11, v11, 8
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a5
-; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 24
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v9, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 24
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
+; LMULMAX1-RV32-NEXT:    vsll.vx v13, v9, a2
+; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a4
+; LMULMAX1-RV32-NEXT:    vsll.vx v9, v9, a3
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v13, v9
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v12, v9
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v8, (a1)
+; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v8, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v8, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 24
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a5
+; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v8, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT:    vsll.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v8, a4
+; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v13
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v8, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 24
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v13, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v11, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
+; LMULMAX1-RV32-NEXT:    vse64.v v9, (a1)
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bswap_v4i64:

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index b9394e35e2b029..e7c459cfc9ff7d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1449,6 +1449,24 @@ declare <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64>, i1 immarg, <2 x i1>, i32)
 define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v2i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -1463,22 +1481,17 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1486,20 +1499,19 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v2i64:
@@ -1556,6 +1568,24 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v2i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -1570,22 +1600,17 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v9
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1593,20 +1618,19 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v2i64_unmasked:
@@ -1667,6 +1691,24 @@ declare <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64>, i1 immarg, <4 x i1>, i32)
 define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v4i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
@@ -1681,22 +1723,17 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v10, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1704,20 +1741,19 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v4i64:
@@ -1774,6 +1810,24 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v4i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v10
@@ -1788,22 +1842,17 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v10, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v10
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1811,20 +1860,19 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v4i64_unmasked:
@@ -1885,6 +1933,24 @@ declare <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64>, i1 immarg, <8 x i1>, i32)
 define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -1899,22 +1965,17 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v12, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1922,20 +1983,19 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v8i64:
@@ -1992,6 +2052,24 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v8i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v12
@@ -2006,22 +2084,17 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v12, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v12
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -2029,20 +2102,19 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v8i64_unmasked:
@@ -2103,6 +2175,24 @@ declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
 define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v15i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2117,22 +2207,17 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -2140,20 +2225,19 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v15i64:
@@ -2210,6 +2294,24 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v15i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -2224,22 +2326,17 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -2247,20 +2344,19 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v15i64_unmasked:
@@ -2321,6 +2417,24 @@ declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
 define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v16i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2335,22 +2449,17 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -2358,20 +2467,19 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v16i64:
@@ -2428,6 +2536,24 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v16i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -2442,22 +2568,17 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -2465,20 +2586,19 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v16i64_unmasked:
@@ -2539,23 +2659,39 @@ declare <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32)
 define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a1, .LBB34_2
+; RV32-NEXT:    bltu a0, a3, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB34_2:
@@ -2573,124 +2709,115 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
@@ -2701,7 +2828,7 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -2715,67 +2842,54 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
@@ -2784,23 +2898,26 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v32i64:
@@ -2920,21 +3037,36 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:    vmv8r.v v0, v16
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a1, .LBB35_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:  .LBB35_2:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
+; RV32-NEXT:    vmv8r.v v24, v16
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB35_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB35_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
@@ -2945,127 +3077,109 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a3), zero
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v0, 1
-; RV32-NEXT:    vor.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vx v0, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v8, v24, 1
+; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v0, v8, v16
+; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v32i64_unmasked:
@@ -4576,6 +4690,24 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
 define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v2i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -4590,22 +4722,17 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -4613,20 +4740,19 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v2i64:
@@ -4683,6 +4809,24 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v2i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -4697,22 +4841,17 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v9
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -4720,20 +4859,19 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v2i64_unmasked:
@@ -4792,6 +4930,24 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v4i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
@@ -4806,22 +4962,17 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v10, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -4829,20 +4980,19 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v4i64:
@@ -4899,6 +5049,24 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v4i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v10
@@ -4913,22 +5081,17 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v10, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v10
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -4936,20 +5099,19 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vadd.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v4i64_unmasked:
@@ -5008,6 +5170,24 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -5022,22 +5202,17 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v12, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -5045,20 +5220,19 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v8i64:
@@ -5115,6 +5289,24 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v8i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v12
@@ -5129,22 +5321,17 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v12, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v12
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -5152,20 +5339,19 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v8i64_unmasked:
@@ -5224,6 +5410,24 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v15i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -5238,22 +5442,17 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -5261,20 +5460,19 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v15i64:
@@ -5331,6 +5529,24 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -5345,22 +5561,17 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -5368,20 +5579,19 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v15i64_unmasked:
@@ -5440,6 +5650,24 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v16i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -5454,22 +5682,17 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -5477,20 +5700,19 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v16i64:
@@ -5547,6 +5769,24 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -5561,22 +5801,17 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -5584,20 +5819,19 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v16i64_unmasked:
@@ -5656,23 +5890,39 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a1, .LBB70_2
+; RV32-NEXT:    bltu a0, a3, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB70_2:
@@ -5690,124 +5940,115 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
@@ -5818,7 +6059,7 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -5832,67 +6073,54 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
@@ -5901,23 +6129,26 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v32i64:
@@ -6037,21 +6268,36 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:    vmv8r.v v0, v16
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a1, .LBB71_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:  .LBB71_2:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
+; RV32-NEXT:    vmv8r.v v24, v16
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB71_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:  .LBB71_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
@@ -6062,127 +6308,109 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a3), zero
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    lui a3, 61681
-; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v0, 1
-; RV32-NEXT:    vor.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vx v0, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vxor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vsrl.vi v8, v24, 1
+; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v0, v8, v16
+; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v32i64_unmasked:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index fb8e2429b3a394..017b8612f7e69a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -460,8 +460,25 @@ declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
 define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-LABEL: ctlz_v2i64:
 ; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, -32
 ; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 24(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 16(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 0(sp)
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 2
@@ -475,44 +492,30 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-NEXT:    li a1, 32
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v9, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.i v9, -1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX2-RV32I-NEXT:    lui a1, 349525
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v9, v9, v10
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32I-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v11, v8, 1
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v11, v9
 ; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    lui a1, 209715
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v9
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v8, v10
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v9, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32I-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32I-NEXT:    mv a1, sp
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v11, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v11
 ; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    lui a1, 61681
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    lui a1, 4112
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
 ; LMULMAX2-RV32I-NEXT:    li a1, 56
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32I-NEXT:    ret
 ;
 ; LMULMAX2-RV64I-LABEL: ctlz_v2i64:
@@ -1157,8 +1160,25 @@ declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
 define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-LABEL: ctlz_v4i64:
 ; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, -32
 ; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 24(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 16(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 0(sp)
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 2
@@ -1172,44 +1192,30 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-NEXT:    li a1, 32
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v10, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.i v10, -1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV32I-NEXT:    lui a1, 349525
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v10, v10, v12
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32I-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v14, v8, 1
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v14, v10
 ; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    lui a1, 209715
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v12, v8, v10
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v12
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v12
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    mv a1, sp
+; LMULMAX2-RV32I-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v14, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v14
 ; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    lui a1, 61681
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    lui a1, 4112
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v12
 ; LMULMAX2-RV32I-NEXT:    li a1, 56
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32I-NEXT:    ret
 ;
 ; LMULMAX2-RV64I-LABEL: ctlz_v4i64:
@@ -1782,8 +1788,25 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-LABEL: ctlz_zero_undef_v2i64:
 ; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, -32
 ; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 24(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 16(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 0(sp)
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 2
@@ -1797,44 +1820,30 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-NEXT:    li a1, 32
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v9, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.i v9, -1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX2-RV32I-NEXT:    lui a1, 349525
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v9, v9, v10
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32I-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v11, v8, 1
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v11, v9
 ; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    lui a1, 209715
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v9
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v8, v10
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v9, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32I-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32I-NEXT:    mv a1, sp
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v11, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v11
 ; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    lui a1, 61681
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    lui a1, 4112
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
 ; LMULMAX2-RV32I-NEXT:    li a1, 56
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32I-NEXT:    ret
 ;
 ; LMULMAX2-RV64I-LABEL: ctlz_zero_undef_v2i64:
@@ -2449,8 +2458,25 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-LABEL: ctlz_zero_undef_v4i64:
 ; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, -32
 ; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 24(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 16(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 0(sp)
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 2
@@ -2464,44 +2490,30 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-NEXT:    li a1, 32
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v10, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.i v10, -1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV32I-NEXT:    lui a1, 349525
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v10, v10, v12
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32I-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v14, v8, 1
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v14, v10
 ; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    lui a1, 209715
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v12, v8, v10
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v12
 ; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v12
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    mv a1, sp
+; LMULMAX2-RV32I-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v14, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v14
 ; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    lui a1, 61681
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    lui a1, 4112
-; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v12
 ; LMULMAX2-RV32I-NEXT:    li a1, 56
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32I-NEXT:    ret
 ;
 ; LMULMAX2-RV64I-LABEL: ctlz_zero_undef_v4i64:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index 71a621c10257d1..10c576c1251d23 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1073,19 +1073,35 @@ declare <2 x i64> @llvm.vp.ctpop.v2i64(<2 x i64>, <2 x i1>, i32)
 define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1093,20 +1109,19 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v2i64:
@@ -1149,19 +1164,35 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v2i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1169,20 +1200,19 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v2i64_unmasked:
@@ -1229,19 +1259,35 @@ declare <4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64>, <4 x i1>, i32)
 define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1249,20 +1295,19 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v4i64:
@@ -1305,19 +1350,35 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v4i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1325,20 +1386,19 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v4i64_unmasked:
@@ -1385,19 +1445,35 @@ declare <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64>, <8 x i1>, i32)
 define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1405,20 +1481,19 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v8i64:
@@ -1461,19 +1536,35 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v8i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1481,20 +1572,19 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v8i64_unmasked:
@@ -1541,20 +1631,35 @@ declare <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64>, <15 x i1>, i32)
 define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1562,20 +1667,19 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v15i64:
@@ -1618,20 +1722,35 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v15i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1639,20 +1758,19 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v15i64_unmasked:
@@ -1699,20 +1817,35 @@ declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32)
 define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1720,20 +1853,19 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v16i64:
@@ -1776,20 +1908,35 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v16i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1797,20 +1944,19 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v16i64_unmasked:
@@ -1857,22 +2003,38 @@ declare <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64>, <32 x i1>, i32)
 define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
@@ -1880,93 +2042,88 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:  .LBB34_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    li a3, 32
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    addi a2, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a2, a2, a4
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a2, a2, a4
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a2, a2, a4
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v16, v8, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
@@ -1975,7 +2132,7 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a2
@@ -1987,53 +2144,53 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
@@ -2041,26 +2198,26 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v32i64:
@@ -2153,19 +2310,35 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 40 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
@@ -2173,24 +2346,21 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:  .LBB35_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    li a3, 32
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a2, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a2, a2, a4
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a2
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -2198,31 +2368,29 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a2, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    vand.vv v16, v8, v16
 ; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a2), zero
+; RV32-NEXT:    addi a2, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v24, v8
+; RV32-NEXT:    vmul.vv v16, v16, v8
 ; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v8, v24, a1
+; RV32-NEXT:    vsrl.vx v8, v16, a1
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a2
@@ -2232,43 +2400,43 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v24, v8, 1
+; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v24, v8, v24
-; RV32-NEXT:    vand.vv v8, v24, v0
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsub.vv v16, v8, v16
+; RV32-NEXT:    vand.vv v8, v16, v0
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v32i64_unmasked:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index 60af2188e754fd..f027de4f1342bc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -288,42 +288,49 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
 define void @ctpop_v2i64(ptr %x, ptr %y) {
 ; LMULMAX2-RV32-LABEL: ctpop_v2i64:
 ; LMULMAX2-RV32:       # %bb.0:
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 1
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v10, v8, v9
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
 ; LMULMAX2-RV32-NEXT:    lui a1, 4112
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmul.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32-NEXT:    vsrl.vi v11, v8, 1
+; LMULMAX2-RV32-NEXT:    vand.vv v9, v11, v9
+; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT:    vand.vv v9, v8, v10
+; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    vadd.vv v8, v9, v8
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32-NEXT:    mv a1, sp
+; LMULMAX2-RV32-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32-NEXT:    vsrl.vi v11, v8, 4
+; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v11
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT:    vmul.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    li a1, 56
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctpop_v2i64:
@@ -364,42 +371,49 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
 ;
 ; LMULMAX1-RV32-LABEL: ctpop_v2i64:
 ; LMULMAX1-RV32:       # %bb.0:
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 1
 ; LMULMAX1-RV32-NEXT:    lui a1, 349525
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    lui a1, 209715
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    lui a1, 61681
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
 ; LMULMAX1-RV32-NEXT:    lui a1, 4112
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmul.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 24
+; LMULMAX1-RV32-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 1
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v11, v9
+; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v8, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT:    vadd.vv v8, v9, v8
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 8
+; LMULMAX1-RV32-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX1-RV32-NEXT:    mv a1, sp
+; LMULMAX1-RV32-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 4
+; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT:    vmul.vv v8, v8, v10
 ; LMULMAX1-RV32-NEXT:    li a1, 56
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctpop_v2i64:
@@ -829,42 +843,49 @@ declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
 define void @ctpop_v4i64(ptr %x, ptr %y) {
 ; LMULMAX2-RV32-LABEL: ctpop_v4i64:
 ; LMULMAX2-RV32:       # %bb.0:
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 1
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v12, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
 ; LMULMAX2-RV32-NEXT:    lui a1, 4112
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmul.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32-NEXT:    vsrl.vi v14, v8, 1
+; LMULMAX2-RV32-NEXT:    vand.vv v10, v14, v10
+; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    vand.vv v10, v8, v12
+; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v12
+; LMULMAX2-RV32-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32-NEXT:    mv a1, sp
+; LMULMAX2-RV32-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32-NEXT:    vsrl.vi v14, v8, 4
+; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    vmul.vv v8, v8, v12
 ; LMULMAX2-RV32-NEXT:    li a1, 56
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctpop_v4i64:
@@ -905,57 +926,64 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
 ;
 ; LMULMAX1-RV32-LABEL: ctpop_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    vle64.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX1-RV32-NEXT:    vle64.v v9, (a1)
 ; LMULMAX1-RV32-NEXT:    lui a2, 349525
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 1365
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v11, a2
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT:    sw a2, 28(sp)
+; LMULMAX1-RV32-NEXT:    sw a2, 24(sp)
 ; LMULMAX1-RV32-NEXT:    lui a2, 209715
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 819
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a2
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v12, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT:    sw a2, 20(sp)
+; LMULMAX1-RV32-NEXT:    sw a2, 16(sp)
 ; LMULMAX1-RV32-NEXT:    lui a2, 61681
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -241
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v12, a2
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT:    sw a2, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a2, 8(sp)
 ; LMULMAX1-RV32-NEXT:    lui a2, 4112
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 257
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v13, a2
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmul.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT:    li a2, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v8, v8, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vi v14, v9, 1
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v14, v11
-; LMULMAX1-RV32-NEXT:    vsub.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v9, v10
+; LMULMAX1-RV32-NEXT:    sw a2, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a2, 0(sp)
+; LMULMAX1-RV32-NEXT:    addi a2, sp, 24
+; LMULMAX1-RV32-NEXT:    vlse64.v v10, (a2), zero
+; LMULMAX1-RV32-NEXT:    addi a2, sp, 16
+; LMULMAX1-RV32-NEXT:    vlse64.v v11, (a2), zero
+; LMULMAX1-RV32-NEXT:    vsrl.vi v12, v9, 1
+; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v10
+; LMULMAX1-RV32-NEXT:    vsub.vv v9, v9, v12
+; LMULMAX1-RV32-NEXT:    vand.vv v12, v9, v11
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v11, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v9, v10
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT:    vadd.vv v9, v12, v9
+; LMULMAX1-RV32-NEXT:    addi a2, sp, 8
+; LMULMAX1-RV32-NEXT:    vlse64.v v12, (a2), zero
+; LMULMAX1-RV32-NEXT:    mv a2, sp
+; LMULMAX1-RV32-NEXT:    vlse64.v v13, (a2), zero
+; LMULMAX1-RV32-NEXT:    vsrl.vi v14, v9, 4
+; LMULMAX1-RV32-NEXT:    vadd.vv v9, v9, v14
 ; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v12
 ; LMULMAX1-RV32-NEXT:    vmul.vv v9, v9, v13
+; LMULMAX1-RV32-NEXT:    li a2, 56
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v9, v9, a2
-; LMULMAX1-RV32-NEXT:    vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v8, (a1)
+; LMULMAX1-RV32-NEXT:    vsrl.vi v14, v8, 1
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v14, v10
+; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v8, v11
+; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT:    vmul.vv v8, v8, v13
+; LMULMAX1-RV32-NEXT:    vsrl.vx v8, v8, a2
+; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
+; LMULMAX1-RV32-NEXT:    vse64.v v9, (a1)
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctpop_v4i64:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 146091a1ff6ae4..95141df2cc5a14 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1233,26 +1233,39 @@ declare <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64>, i1 immarg, <2 x i1>, i32)
 define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v2i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1260,20 +1273,19 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v2i64:
@@ -1320,26 +1332,39 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v2i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v10
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1347,20 +1372,19 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v2i64_unmasked:
@@ -1411,26 +1435,39 @@ declare <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64>, i1 immarg, <4 x i1>, i32)
 define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v4i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v12, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1438,20 +1475,19 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v4i64:
@@ -1498,26 +1534,39 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v4i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v12, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v12
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1525,20 +1574,19 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v4i64_unmasked:
@@ -1589,26 +1637,39 @@ declare <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64>, i1 immarg, <8 x i1>, i32)
 define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1616,20 +1677,19 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v8i64:
@@ -1676,26 +1736,39 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v8i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1703,20 +1776,19 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v8i64_unmasked:
@@ -1767,27 +1839,39 @@ declare <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
 define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v15i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1795,20 +1879,19 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v15i64:
@@ -1855,27 +1938,39 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v15i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -1883,20 +1978,19 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v15i64_unmasked:
@@ -1947,27 +2041,39 @@ declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
 define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v16i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1975,20 +2081,19 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v16i64:
@@ -2035,27 +2140,39 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v16i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -2063,20 +2180,19 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v16i64_unmasked:
@@ -2127,165 +2243,155 @@ declare <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32)
 define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv8r.v v16, v8
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a1, .LBB34_2
-; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB34_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB34_2:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 56
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    li a3, 32
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v8, -1
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 56
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 56
-; RV32-NEXT:    mul a5, a5, a6
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 56
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 209715
-; RV32-NEXT:    addi a4, a4, 819
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 56
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    addi a4, a4, -241
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a4
+; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
@@ -2294,107 +2400,88 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 6
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v32i64:
@@ -2494,133 +2581,129 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    vmv8r.v v0, v16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
+; RV32-NEXT:    vmv8r.v v24, v16
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a2
-; RV32-NEXT:    li a3, 32
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a4, 209715
-; RV32-NEXT:    addi a4, a4, 819
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a4
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vand.vv v16, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    addi a4, a4, -241
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v24
+; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vxor.vv v8, v0, v8
-; RV32-NEXT:    vsub.vx v0, v0, a2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vsub.vx v8, v24, a2
+; RV32-NEXT:    vnot.v v24, v24
+; RV32-NEXT:    vand.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v0, v8, v16
+; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v32i64_unmasked:
@@ -3895,26 +3978,39 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
 define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v2i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -3922,20 +4018,19 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v2i64:
@@ -3982,26 +4077,39 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v10
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v10, v8, v9
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -4009,20 +4117,19 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
@@ -4071,26 +4178,39 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v4i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v12, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -4098,20 +4218,19 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v4i64:
@@ -4158,26 +4277,39 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v12, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v12
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v12, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -4185,20 +4317,19 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vadd.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
@@ -4247,26 +4378,39 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -4274,20 +4418,19 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v8i64:
@@ -4334,26 +4477,39 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -4361,20 +4517,19 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
@@ -4423,27 +4578,39 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v15i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -4451,20 +4618,19 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v15i64:
@@ -4511,27 +4677,39 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -4539,20 +4717,19 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
@@ -4601,27 +4778,39 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v16i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -4629,20 +4818,19 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v16i64:
@@ -4689,27 +4877,39 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
@@ -4717,20 +4917,19 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    lui a2, 4112
-; RV32-NEXT:    addi a2, a2, 257
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
@@ -4779,165 +4978,155 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v32i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv8r.v v16, v8
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
-; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a1, .LBB70_2
-; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltu a0, a2, .LBB70_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB70_2:
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
+; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 56
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    li a3, 32
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v8, -1
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 56
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 56
-; RV32-NEXT:    mul a5, a5, a6
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 56
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 209715
-; RV32-NEXT:    addi a4, a4, 819
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 56
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    addi a4, a4, -241
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a4
+; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
@@ -4946,107 +5135,88 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vxor.vv v16, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 6
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v32i64:
@@ -5146,133 +5316,129 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb
+; RV32-NEXT:    vmv8r.v v24, v16
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 40(sp)
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 32(sp)
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    vmv8r.v v0, v16
+; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    mv a1, a0
 ; RV32-NEXT:    bltu a0, a2, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB71_2:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a2
-; RV32-NEXT:    li a3, 32
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v24, -1
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v8, v24
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a4, 209715
-; RV32-NEXT:    addi a4, a4, 819
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a4
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vand.vv v16, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    addi a4, a4, -241
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v24
+; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vxor.vv v8, v0, v8
-; RV32-NEXT:    vsub.vx v0, v0, a2
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vsub.vx v8, v24, a2
+; RV32-NEXT:    vnot.v v24, v24
+; RV32-NEXT:    vand.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v0, v8, v16
+; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v0
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, sp, 48
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v32i64_unmasked:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index c97163ca1669ac..9cdcf59ba914d9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -456,49 +456,52 @@ declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
 define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-LABEL: cttz_v2i64:
 ; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, -32
 ; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32I-NEXT:    li a1, 1
-; LMULMAX2-RV32I-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.i v10, -1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; LMULMAX2-RV32I-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v9, v9, v10
-; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v9
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 8(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 4112
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32I-NEXT:    li a1, 1
+; LMULMAX2-RV32I-NEXT:    vsub.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32I-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v11, v8, 1
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v11, v9
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v9, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32I-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32I-NEXT:    mv a1, sp
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v11, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v11
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
 ; LMULMAX2-RV32I-NEXT:    li a1, 56
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32I-NEXT:    ret
 ;
 ; LMULMAX2-RV64I-LABEL: cttz_v2i64:
@@ -545,23 +548,20 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32F:       # %bb.0:
 ; LMULMAX2-RV32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX2-RV32F-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32F-NEXT:    vmv.v.i v9, 0
-; LMULMAX2-RV32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32F-NEXT:    vmseq.vv v0, v8, v9
-; LMULMAX2-RV32F-NEXT:    vsub.vv v9, v9, v8
-; LMULMAX2-RV32F-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32F-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX2-RV32F-NEXT:    vand.vv v9, v8, v9
 ; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
 ; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; LMULMAX2-RV32F-NEXT:    vfncvt.f.xu.w v9, v8
+; LMULMAX2-RV32F-NEXT:    vfncvt.f.xu.w v10, v9
 ; LMULMAX2-RV32F-NEXT:    fsrm a1
-; LMULMAX2-RV32F-NEXT:    vsrl.vi v8, v9, 23
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v9, v10, 23
 ; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; LMULMAX2-RV32F-NEXT:    vzext.vf2 v9, v8
+; LMULMAX2-RV32F-NEXT:    vzext.vf2 v10, v9
 ; LMULMAX2-RV32F-NEXT:    li a1, 127
-; LMULMAX2-RV32F-NEXT:    vsub.vx v8, v9, a1
+; LMULMAX2-RV32F-NEXT:    vsub.vx v9, v10, a1
+; LMULMAX2-RV32F-NEXT:    vmseq.vi v0, v8, 0
 ; LMULMAX2-RV32F-NEXT:    li a1, 64
-; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; LMULMAX2-RV32F-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV32F-NEXT:    ret
 ;
@@ -589,21 +589,18 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32D:       # %bb.0:
 ; LMULMAX2-RV32D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX2-RV32D-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32D-NEXT:    vmv.v.i v9, 0
-; LMULMAX2-RV32D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32D-NEXT:    vmseq.vv v0, v8, v9
-; LMULMAX2-RV32D-NEXT:    vsub.vv v9, v9, v8
-; LMULMAX2-RV32D-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32D-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX2-RV32D-NEXT:    vand.vv v9, v8, v9
 ; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
-; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v8, v8
+; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v9, v9
 ; LMULMAX2-RV32D-NEXT:    fsrm a1
 ; LMULMAX2-RV32D-NEXT:    li a1, 52
-; LMULMAX2-RV32D-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV32D-NEXT:    vsrl.vx v9, v9, a1
 ; LMULMAX2-RV32D-NEXT:    li a1, 1023
-; LMULMAX2-RV32D-NEXT:    vsub.vx v8, v8, a1
+; LMULMAX2-RV32D-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT:    vmseq.vi v0, v8, 0
 ; LMULMAX2-RV32D-NEXT:    li a1, 64
-; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v8, a1, v0
+; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; LMULMAX2-RV32D-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV32D-NEXT:    ret
 ;
@@ -626,46 +623,24 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV64D-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV64D-NEXT:    ret
 ;
-; LMULMAX8-RV32-LABEL: cttz_v2i64:
-; LMULMAX8-RV32:       # %bb.0:
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.i v9, 0
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmseq.vv v0, v8, v9
-; LMULMAX8-RV32-NEXT:    vsub.vv v9, v9, v8
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    fsrmi a1, 1
-; LMULMAX8-RV32-NEXT:    vfcvt.f.xu.v v8, v8
-; LMULMAX8-RV32-NEXT:    fsrm a1
-; LMULMAX8-RV32-NEXT:    li a1, 52
-; LMULMAX8-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX8-RV32-NEXT:    li a1, 1023
-; LMULMAX8-RV32-NEXT:    vsub.vx v8, v8, a1
-; LMULMAX8-RV32-NEXT:    li a1, 64
-; LMULMAX8-RV32-NEXT:    vmerge.vxm v8, v8, a1, v0
-; LMULMAX8-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    ret
-;
-; LMULMAX8-RV64-LABEL: cttz_v2i64:
-; LMULMAX8-RV64:       # %bb.0:
-; LMULMAX8-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    vrsub.vi v9, v8, 0
-; LMULMAX8-RV64-NEXT:    vand.vv v9, v8, v9
-; LMULMAX8-RV64-NEXT:    fsrmi a1, 1
-; LMULMAX8-RV64-NEXT:    vfcvt.f.xu.v v9, v9
-; LMULMAX8-RV64-NEXT:    fsrm a1
-; LMULMAX8-RV64-NEXT:    li a1, 52
-; LMULMAX8-RV64-NEXT:    vsrl.vx v9, v9, a1
-; LMULMAX8-RV64-NEXT:    li a1, 1023
-; LMULMAX8-RV64-NEXT:    vsub.vx v9, v9, a1
-; LMULMAX8-RV64-NEXT:    vmseq.vi v0, v8, 0
-; LMULMAX8-RV64-NEXT:    li a1, 64
-; LMULMAX8-RV64-NEXT:    vmerge.vxm v8, v9, a1, v0
-; LMULMAX8-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    ret
+; LMULMAX8-LABEL: cttz_v2i64:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX8-NEXT:    vle64.v v8, (a0)
+; LMULMAX8-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX8-NEXT:    vand.vv v9, v8, v9
+; LMULMAX8-NEXT:    fsrmi a1, 1
+; LMULMAX8-NEXT:    vfcvt.f.xu.v v9, v9
+; LMULMAX8-NEXT:    fsrm a1
+; LMULMAX8-NEXT:    li a1, 52
+; LMULMAX8-NEXT:    vsrl.vx v9, v9, a1
+; LMULMAX8-NEXT:    li a1, 1023
+; LMULMAX8-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX8-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX8-NEXT:    li a1, 64
+; LMULMAX8-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX8-NEXT:    vse64.v v8, (a0)
+; LMULMAX8-NEXT:    ret
 ;
 ; ZVBB-LABEL: cttz_v2i64:
 ; ZVBB:       # %bb.0:
@@ -1140,49 +1115,52 @@ declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
 define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-LABEL: cttz_v4i64:
 ; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, -32
 ; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32I-NEXT:    li a1, 1
-; LMULMAX2-RV32I-NEXT:    vsub.vx v10, v8, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.i v12, -1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v12
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; LMULMAX2-RV32I-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v10, v10, v12
-; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v12, v8, v10
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 8(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 4112
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32I-NEXT:    li a1, 1
+; LMULMAX2-RV32I-NEXT:    vsub.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32I-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v14, v8, 1
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v14, v10
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v12
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v12
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    mv a1, sp
+; LMULMAX2-RV32I-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v14, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v14
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v12
 ; LMULMAX2-RV32I-NEXT:    li a1, 56
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32I-NEXT:    ret
 ;
 ; LMULMAX2-RV64I-LABEL: cttz_v4i64:
@@ -1229,23 +1207,20 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32F:       # %bb.0:
 ; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32F-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32F-NEXT:    vmv.v.i v10, 0
-; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32F-NEXT:    vmseq.vv v0, v8, v10
-; LMULMAX2-RV32F-NEXT:    vsub.vv v10, v10, v8
-; LMULMAX2-RV32F-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32F-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX2-RV32F-NEXT:    vand.vv v10, v8, v10
 ; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
 ; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; LMULMAX2-RV32F-NEXT:    vfncvt.f.xu.w v10, v8
+; LMULMAX2-RV32F-NEXT:    vfncvt.f.xu.w v12, v10
 ; LMULMAX2-RV32F-NEXT:    fsrm a1
-; LMULMAX2-RV32F-NEXT:    vsrl.vi v8, v10, 23
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v10, v12, 23
 ; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; LMULMAX2-RV32F-NEXT:    vzext.vf2 v10, v8
+; LMULMAX2-RV32F-NEXT:    vzext.vf2 v12, v10
 ; LMULMAX2-RV32F-NEXT:    li a1, 127
-; LMULMAX2-RV32F-NEXT:    vsub.vx v8, v10, a1
+; LMULMAX2-RV32F-NEXT:    vsub.vx v10, v12, a1
+; LMULMAX2-RV32F-NEXT:    vmseq.vi v0, v8, 0
 ; LMULMAX2-RV32F-NEXT:    li a1, 64
-; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; LMULMAX2-RV32F-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV32F-NEXT:    ret
 ;
@@ -1273,21 +1248,18 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32D:       # %bb.0:
 ; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32D-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32D-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32D-NEXT:    vmv.v.i v10, 0
-; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32D-NEXT:    vmseq.vv v0, v8, v10
-; LMULMAX2-RV32D-NEXT:    vsub.vv v10, v10, v8
-; LMULMAX2-RV32D-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32D-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX2-RV32D-NEXT:    vand.vv v10, v8, v10
 ; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
-; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v8, v8
+; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v10, v10
 ; LMULMAX2-RV32D-NEXT:    fsrm a1
 ; LMULMAX2-RV32D-NEXT:    li a1, 52
-; LMULMAX2-RV32D-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV32D-NEXT:    vsrl.vx v10, v10, a1
 ; LMULMAX2-RV32D-NEXT:    li a1, 1023
-; LMULMAX2-RV32D-NEXT:    vsub.vx v8, v8, a1
+; LMULMAX2-RV32D-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT:    vmseq.vi v0, v8, 0
 ; LMULMAX2-RV32D-NEXT:    li a1, 64
-; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v8, a1, v0
+; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; LMULMAX2-RV32D-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV32D-NEXT:    ret
 ;
@@ -1310,46 +1282,24 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV64D-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV64D-NEXT:    ret
 ;
-; LMULMAX8-RV32-LABEL: cttz_v4i64:
-; LMULMAX8-RV32:       # %bb.0:
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.i v10, 0
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmseq.vv v0, v8, v10
-; LMULMAX8-RV32-NEXT:    vsub.vv v10, v10, v8
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    fsrmi a1, 1
-; LMULMAX8-RV32-NEXT:    vfcvt.f.xu.v v8, v8
-; LMULMAX8-RV32-NEXT:    fsrm a1
-; LMULMAX8-RV32-NEXT:    li a1, 52
-; LMULMAX8-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX8-RV32-NEXT:    li a1, 1023
-; LMULMAX8-RV32-NEXT:    vsub.vx v8, v8, a1
-; LMULMAX8-RV32-NEXT:    li a1, 64
-; LMULMAX8-RV32-NEXT:    vmerge.vxm v8, v8, a1, v0
-; LMULMAX8-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    ret
-;
-; LMULMAX8-RV64-LABEL: cttz_v4i64:
-; LMULMAX8-RV64:       # %bb.0:
-; LMULMAX8-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    vrsub.vi v10, v8, 0
-; LMULMAX8-RV64-NEXT:    vand.vv v10, v8, v10
-; LMULMAX8-RV64-NEXT:    fsrmi a1, 1
-; LMULMAX8-RV64-NEXT:    vfcvt.f.xu.v v10, v10
-; LMULMAX8-RV64-NEXT:    fsrm a1
-; LMULMAX8-RV64-NEXT:    li a1, 52
-; LMULMAX8-RV64-NEXT:    vsrl.vx v10, v10, a1
-; LMULMAX8-RV64-NEXT:    li a1, 1023
-; LMULMAX8-RV64-NEXT:    vsub.vx v10, v10, a1
-; LMULMAX8-RV64-NEXT:    vmseq.vi v0, v8, 0
-; LMULMAX8-RV64-NEXT:    li a1, 64
-; LMULMAX8-RV64-NEXT:    vmerge.vxm v8, v10, a1, v0
-; LMULMAX8-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    ret
+; LMULMAX8-LABEL: cttz_v4i64:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX8-NEXT:    vle64.v v8, (a0)
+; LMULMAX8-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX8-NEXT:    vand.vv v10, v8, v10
+; LMULMAX8-NEXT:    fsrmi a1, 1
+; LMULMAX8-NEXT:    vfcvt.f.xu.v v10, v10
+; LMULMAX8-NEXT:    fsrm a1
+; LMULMAX8-NEXT:    li a1, 52
+; LMULMAX8-NEXT:    vsrl.vx v10, v10, a1
+; LMULMAX8-NEXT:    li a1, 1023
+; LMULMAX8-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX8-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX8-NEXT:    li a1, 64
+; LMULMAX8-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX8-NEXT:    vse64.v v8, (a0)
+; LMULMAX8-NEXT:    ret
 ;
 ; ZVBB-LABEL: cttz_v4i64:
 ; ZVBB:       # %bb.0:
@@ -1773,49 +1723,52 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-LABEL: cttz_zero_undef_v2i64:
 ; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, -32
 ; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32I-NEXT:    li a1, 1
-; LMULMAX2-RV32I-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.i v10, -1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; LMULMAX2-RV32I-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v9, v9, v10
-; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v9
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 8(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 4112
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32I-NEXT:    li a1, 1
+; LMULMAX2-RV32I-NEXT:    vsub.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32I-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v11, v8, 1
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v11, v9
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v9, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32I-NEXT:    vlse64.v v9, (a1), zero
+; LMULMAX2-RV32I-NEXT:    mv a1, sp
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v11, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v11
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
 ; LMULMAX2-RV32I-NEXT:    li a1, 56
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32I-NEXT:    ret
 ;
 ; LMULMAX2-RV64I-LABEL: cttz_zero_undef_v2i64:
@@ -1862,10 +1815,7 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32F:       # %bb.0:
 ; LMULMAX2-RV32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX2-RV32F-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32F-NEXT:    vmv.v.i v9, 0
-; LMULMAX2-RV32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32F-NEXT:    vsub.vv v9, v9, v8
+; LMULMAX2-RV32F-NEXT:    vrsub.vi v9, v8, 0
 ; LMULMAX2-RV32F-NEXT:    vand.vv v8, v8, v9
 ; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
 ; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -1899,10 +1849,7 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32D:       # %bb.0:
 ; LMULMAX2-RV32D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX2-RV32D-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32D-NEXT:    vmv.v.i v9, 0
-; LMULMAX2-RV32D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32D-NEXT:    vsub.vv v9, v9, v8
+; LMULMAX2-RV32D-NEXT:    vrsub.vi v9, v8, 0
 ; LMULMAX2-RV32D-NEXT:    vand.vv v8, v8, v9
 ; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
 ; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v8, v8
@@ -1930,40 +1877,21 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV64D-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV64D-NEXT:    ret
 ;
-; LMULMAX8-RV32-LABEL: cttz_zero_undef_v2i64:
-; LMULMAX8-RV32:       # %bb.0:
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.i v9, 0
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vsub.vv v9, v9, v8
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    fsrmi a1, 1
-; LMULMAX8-RV32-NEXT:    vfcvt.f.xu.v v8, v8
-; LMULMAX8-RV32-NEXT:    fsrm a1
-; LMULMAX8-RV32-NEXT:    li a1, 52
-; LMULMAX8-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX8-RV32-NEXT:    li a1, 1023
-; LMULMAX8-RV32-NEXT:    vsub.vx v8, v8, a1
-; LMULMAX8-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    ret
-;
-; LMULMAX8-RV64-LABEL: cttz_zero_undef_v2i64:
-; LMULMAX8-RV64:       # %bb.0:
-; LMULMAX8-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    vrsub.vi v9, v8, 0
-; LMULMAX8-RV64-NEXT:    vand.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    fsrmi a1, 1
-; LMULMAX8-RV64-NEXT:    vfcvt.f.xu.v v8, v8
-; LMULMAX8-RV64-NEXT:    fsrm a1
-; LMULMAX8-RV64-NEXT:    li a1, 52
-; LMULMAX8-RV64-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT:    li a1, 1023
-; LMULMAX8-RV64-NEXT:    vsub.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    ret
+; LMULMAX8-LABEL: cttz_zero_undef_v2i64:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX8-NEXT:    vle64.v v8, (a0)
+; LMULMAX8-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX8-NEXT:    vand.vv v8, v8, v9
+; LMULMAX8-NEXT:    fsrmi a1, 1
+; LMULMAX8-NEXT:    vfcvt.f.xu.v v8, v8
+; LMULMAX8-NEXT:    fsrm a1
+; LMULMAX8-NEXT:    li a1, 52
+; LMULMAX8-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-NEXT:    li a1, 1023
+; LMULMAX8-NEXT:    vsub.vx v8, v8, a1
+; LMULMAX8-NEXT:    vse64.v v8, (a0)
+; LMULMAX8-NEXT:    ret
 ;
 ; ZVBB-LABEL: cttz_zero_undef_v2i64:
 ; ZVBB:       # %bb.0:
@@ -2414,49 +2342,52 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32I-LABEL: cttz_zero_undef_v4i64:
 ; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, -32
 ; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32I-NEXT:    li a1, 1
-; LMULMAX2-RV32I-NEXT:    vsub.vx v10, v8, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.i v12, -1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v12
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; LMULMAX2-RV32I-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v10, v10, v12
-; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v12, v8, v10
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 8(sp)
 ; LMULMAX2-RV32I-NEXT:    lui a1, 4112
 ; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32I-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32I-NEXT:    li a1, 1
+; LMULMAX2-RV32I-NEXT:    vsub.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 24
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32I-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v14, v8, 1
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v14, v10
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v12
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v12
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT:    addi a1, sp, 8
+; LMULMAX2-RV32I-NEXT:    vlse64.v v10, (a1), zero
+; LMULMAX2-RV32I-NEXT:    mv a1, sp
+; LMULMAX2-RV32I-NEXT:    vlse64.v v12, (a1), zero
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v14, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v14
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v12
 ; LMULMAX2-RV32I-NEXT:    li a1, 56
 ; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
 ; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32I-NEXT:    ret
 ;
 ; LMULMAX2-RV64I-LABEL: cttz_zero_undef_v4i64:
@@ -2503,10 +2434,7 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32F:       # %bb.0:
 ; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32F-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32F-NEXT:    vmv.v.i v10, 0
-; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32F-NEXT:    vsub.vv v10, v10, v8
+; LMULMAX2-RV32F-NEXT:    vrsub.vi v10, v8, 0
 ; LMULMAX2-RV32F-NEXT:    vand.vv v8, v8, v10
 ; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
 ; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -2540,10 +2468,7 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV32D:       # %bb.0:
 ; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32D-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32D-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32D-NEXT:    vmv.v.i v10, 0
-; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32D-NEXT:    vsub.vv v10, v10, v8
+; LMULMAX2-RV32D-NEXT:    vrsub.vi v10, v8, 0
 ; LMULMAX2-RV32D-NEXT:    vand.vv v8, v8, v10
 ; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
 ; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v8, v8
@@ -2571,40 +2496,21 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV64D-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV64D-NEXT:    ret
 ;
-; LMULMAX8-RV32-LABEL: cttz_zero_undef_v4i64:
-; LMULMAX8-RV32:       # %bb.0:
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.i v10, 0
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vsub.vv v10, v10, v8
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    fsrmi a1, 1
-; LMULMAX8-RV32-NEXT:    vfcvt.f.xu.v v8, v8
-; LMULMAX8-RV32-NEXT:    fsrm a1
-; LMULMAX8-RV32-NEXT:    li a1, 52
-; LMULMAX8-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX8-RV32-NEXT:    li a1, 1023
-; LMULMAX8-RV32-NEXT:    vsub.vx v8, v8, a1
-; LMULMAX8-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    ret
-;
-; LMULMAX8-RV64-LABEL: cttz_zero_undef_v4i64:
-; LMULMAX8-RV64:       # %bb.0:
-; LMULMAX8-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    vrsub.vi v10, v8, 0
-; LMULMAX8-RV64-NEXT:    vand.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    fsrmi a1, 1
-; LMULMAX8-RV64-NEXT:    vfcvt.f.xu.v v8, v8
-; LMULMAX8-RV64-NEXT:    fsrm a1
-; LMULMAX8-RV64-NEXT:    li a1, 52
-; LMULMAX8-RV64-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT:    li a1, 1023
-; LMULMAX8-RV64-NEXT:    vsub.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    ret
+; LMULMAX8-LABEL: cttz_zero_undef_v4i64:
+; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX8-NEXT:    vle64.v v8, (a0)
+; LMULMAX8-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX8-NEXT:    vand.vv v8, v8, v10
+; LMULMAX8-NEXT:    fsrmi a1, 1
+; LMULMAX8-NEXT:    vfcvt.f.xu.v v8, v8
+; LMULMAX8-NEXT:    fsrm a1
+; LMULMAX8-NEXT:    li a1, 52
+; LMULMAX8-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-NEXT:    li a1, 1023
+; LMULMAX8-NEXT:    vsub.vx v8, v8, a1
+; LMULMAX8-NEXT:    vse64.v v8, (a0)
+; LMULMAX8-NEXT:    ret
 ;
 ; ZVBB-LABEL: cttz_zero_undef_v4i64:
 ; ZVBB:       # %bb.0:
@@ -2619,3 +2525,6 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
   store <4 x i64> %c, ptr %x
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; LMULMAX8-RV32: {{.*}}
+; LMULMAX8-RV64: {{.*}}

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
index 5f1cc2d4b4e961..4a16df1b7c7ff0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
@@ -522,408 +522,216 @@ define <16 x i32> @fshl_v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16
 
 declare <2 x i64> @llvm.vp.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i1>, i32)
 define <2 x i64> @fshr_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshr_v2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v10, a1, v0.t
-; RV32-NEXT:    vsrl.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v11, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v10, v10, v11, v0.t
-; RV32-NEXT:    vand.vx v10, v10, a1, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vsll.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshr_v2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 63
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v11, v10, a1, v0.t
-; RV64-NEXT:    vsrl.vv v9, v9, v11, v0.t
-; RV64-NEXT:    vnot.v v10, v10, v0.t
-; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
-; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV64-NEXT:    vsll.vv v8, v8, v10, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshr_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vx v11, v10, a1, v0.t
+; CHECK-NEXT:    vsrl.vv v9, v9, v11, v0.t
+; CHECK-NEXT:    vnot.v v10, v10, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.vp.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 %evl)
   ret <2 x i64> %res
 }
 
 declare <2 x i64> @llvm.vp.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i1>, i32)
 define <2 x i64> @fshl_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshl_v2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v10, a1, v0.t
-; RV32-NEXT:    vsll.vv v8, v8, v11, v0.t
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v11, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vxor.vv v10, v10, v11, v0.t
-; RV32-NEXT:    vand.vx v10, v10, a1, v0.t
-; RV32-NEXT:    vsrl.vi v9, v9, 1, v0.t
-; RV32-NEXT:    vsrl.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshl_v2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 63
-; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v11, v10, a1, v0.t
-; RV64-NEXT:    vsll.vv v8, v8, v11, v0.t
-; RV64-NEXT:    vnot.v v10, v10, v0.t
-; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
-; RV64-NEXT:    vsrl.vi v9, v9, 1, v0.t
-; RV64-NEXT:    vsrl.vv v9, v9, v10, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshl_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vx v11, v10, a1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v11, v0.t
+; CHECK-NEXT:    vnot.v v10, v10, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a1, v0.t
+; CHECK-NEXT:    vsrl.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v9, v9, v10, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.vp.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 %evl)
   ret <2 x i64> %res
 }
 
 declare <4 x i64> @llvm.vp.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>, <4 x i1>, i32)
 define <4 x i64> @fshr_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshr_v4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v12, a1, v0.t
-; RV32-NEXT:    vsrl.vv v10, v10, v14, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v14, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v12, v12, v14, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vsll.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshr_v4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 63
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v14, v12, a1, v0.t
-; RV64-NEXT:    vsrl.vv v10, v10, v14, v0.t
-; RV64-NEXT:    vnot.v v12, v12, v0.t
-; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
-; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV64-NEXT:    vsll.vv v8, v8, v12, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshr_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vand.vx v14, v12, a1, v0.t
+; CHECK-NEXT:    vsrl.vv v10, v10, v14, v0.t
+; CHECK-NEXT:    vnot.v v12, v12, v0.t
+; CHECK-NEXT:    vand.vx v12, v12, a1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
   %res = call <4 x i64> @llvm.vp.fshr.v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %res
 }
 
 declare <4 x i64> @llvm.vp.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>, <4 x i1>, i32)
 define <4 x i64> @fshl_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshl_v4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v12, a1, v0.t
-; RV32-NEXT:    vsll.vv v8, v8, v14, v0.t
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v14, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vxor.vv v12, v12, v14, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
-; RV32-NEXT:    vsrl.vi v10, v10, 1, v0.t
-; RV32-NEXT:    vsrl.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshl_v4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 63
-; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v14, v12, a1, v0.t
-; RV64-NEXT:    vsll.vv v8, v8, v14, v0.t
-; RV64-NEXT:    vnot.v v12, v12, v0.t
-; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
-; RV64-NEXT:    vsrl.vi v10, v10, 1, v0.t
-; RV64-NEXT:    vsrl.vv v10, v10, v12, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshl_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vand.vx v14, v12, a1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v14, v0.t
+; CHECK-NEXT:    vnot.v v12, v12, v0.t
+; CHECK-NEXT:    vand.vx v12, v12, a1, v0.t
+; CHECK-NEXT:    vsrl.vi v10, v10, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v10, v10, v12, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    ret
   %res = call <4 x i64> @llvm.vp.fshl.v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %res
 }
 
 declare <7 x i64> @llvm.vp.fshr.v7i64(<7 x i64>, <7 x i64>, <7 x i64>, <7 x i1>, i32)
 define <7 x i64> @fshr_v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshr_v7i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v16, a1, v0.t
-; RV32-NEXT:    vsrl.vv v12, v12, v20, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v20, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vsll.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshr_v7i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 63
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v20, v16, a1, v0.t
-; RV64-NEXT:    vsrl.vv v12, v12, v20, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV64-NEXT:    vsll.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshr_v7i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vand.vx v20, v16, a1, v0.t
+; CHECK-NEXT:    vsrl.vv v12, v12, v20, v0.t
+; CHECK-NEXT:    vnot.v v16, v16, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
   %res = call <7 x i64> @llvm.vp.fshr.v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> %m, i32 %evl)
   ret <7 x i64> %res
 }
 
 declare <7 x i64> @llvm.vp.fshl.v7i64(<7 x i64>, <7 x i64>, <7 x i64>, <7 x i1>, i32)
 define <7 x i64> @fshl_v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshl_v7i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v16, a1, v0.t
-; RV32-NEXT:    vsll.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v20, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV32-NEXT:    vsrl.vi v12, v12, 1, v0.t
-; RV32-NEXT:    vsrl.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshl_v7i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 63
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v20, v16, a1, v0.t
-; RV64-NEXT:    vsll.vv v8, v8, v20, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsrl.vi v12, v12, 1, v0.t
-; RV64-NEXT:    vsrl.vv v12, v12, v16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshl_v7i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vand.vx v20, v16, a1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v20, v0.t
+; CHECK-NEXT:    vnot.v v16, v16, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vsrl.vi v12, v12, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v12, v12, v16, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
   %res = call <7 x i64> @llvm.vp.fshl.v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> %m, i32 %evl)
   ret <7 x i64> %res
 }
 
 declare <8 x i64> @llvm.vp.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i1>, i32)
 define <8 x i64> @fshr_v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshr_v8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v16, a1, v0.t
-; RV32-NEXT:    vsrl.vv v12, v12, v20, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v20, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vsll.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshr_v8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 63
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v20, v16, a1, v0.t
-; RV64-NEXT:    vsrl.vv v12, v12, v20, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV64-NEXT:    vsll.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshr_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vand.vx v20, v16, a1, v0.t
+; CHECK-NEXT:    vsrl.vv v12, v12, v20, v0.t
+; CHECK-NEXT:    vnot.v v16, v16, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
   %res = call <8 x i64> @llvm.vp.fshr.v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 %evl)
   ret <8 x i64> %res
 }
 
 declare <8 x i64> @llvm.vp.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i1>, i32)
 define <8 x i64> @fshl_v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshl_v8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v16, a1, v0.t
-; RV32-NEXT:    vsll.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v20, -1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vxor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV32-NEXT:    vsrl.vi v12, v12, 1, v0.t
-; RV32-NEXT:    vsrl.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshl_v8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 63
-; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v20, v16, a1, v0.t
-; RV64-NEXT:    vsll.vv v8, v8, v20, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsrl.vi v12, v12, 1, v0.t
-; RV64-NEXT:    vsrl.vv v12, v12, v16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshl_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT:    vand.vx v20, v16, a1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v20, v0.t
+; CHECK-NEXT:    vnot.v v16, v16, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vsrl.vi v12, v12, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v12, v12, v16, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    ret
   %res = call <8 x i64> @llvm.vp.fshl.v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 %evl)
   ret <8 x i64> %res
 }
 
 declare <16 x i64> @llvm.vp.fshr.v16i64(<16 x i64>, <16 x i64>, <16 x i64>, <16 x i1>, i32)
 define <16 x i64> @fshr_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshr_v16i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    li a0, 63
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v8, v24, a0, v0.t
-; RV32-NEXT:    vsrl.vv v16, v16, v8, v0.t
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v8, -1
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsll.vi v24, v24, 1, v0.t
-; RV32-NEXT:    vsll.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshr_v16i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    li a0, 63
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v24, a0, v0.t
-; RV64-NEXT:    vsrl.vv v16, v16, v8, v0.t
-; RV64-NEXT:    vnot.v v8, v24, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsll.vi v24, v24, 1, v0.t
-; RV64-NEXT:    vsll.vv v8, v24, v8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshr_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
+; CHECK-NEXT:    vnot.v v8, v24, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsll.vi v24, v24, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
   %res = call <16 x i64> @llvm.vp.fshr.v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %res
 }
 
 declare <16 x i64> @llvm.vp.fshl.v16i64(<16 x i64>, <16 x i64>, <16 x i64>, <16 x i1>, i32)
 define <16 x i64> @fshl_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: fshl_v16i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv8r.v v16, v8
-; RV32-NEXT:    li a0, 63
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v8, v24, a0, v0.t
-; RV32-NEXT:    vsll.vv v8, v16, v8, v0.t
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v16, -1
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vxor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v24, v24, 1, v0.t
-; RV32-NEXT:    vsrl.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: fshl_v16i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv8r.v v16, v8
-; RV64-NEXT:    li a0, 63
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v24, a0, v0.t
-; RV64-NEXT:    vsll.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vnot.v v16, v24, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsrl.vi v24, v24, 1, v0.t
-; RV64-NEXT:    vsrl.vv v16, v24, v16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    ret
+; CHECK-LABEL: fshl_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
+; CHECK-NEXT:    vsll.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vnot.v v16, v24, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsrl.vi v24, v24, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
   %res = call <16 x i64> @llvm.vp.fshl.v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
index 6568648f98cf57..82c160577a50ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
@@ -875,34 +875,17 @@ define <16 x i32> @vrol_vx_v16i32(<16 x i32> %a, i32 %b) {
 declare <1 x i64> @llvm.fshl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>)
 
 define <1 x i64> @vrol_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-RV32-LABEL: vrol_vv_v1i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v10, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vv v11, v10, v9
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v10, a0
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vv v11, v11, v10
-; CHECK-RV32-NEXT:    vsrl.vv v11, v8, v11
-; CHECK-RV32-NEXT:    vand.vv v9, v9, v10
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v9
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v11
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vrol_vv_v1i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
-; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vrol_vv_v1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vx v10, v9, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vv v8, v8, v9
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vrol_vv_v1i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -914,35 +897,18 @@ define <1 x i64> @vrol_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
 }
 
 define <1 x i64> @vrol_vx_v1i64(<1 x i64> %a, i64 %b) {
-; CHECK-RV32-LABEL: vrol_vx_v1i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v9, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v10, v9, a0
-; CHECK-RV32-NEXT:    li a1, 63
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v9, a1
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vv v10, v10, v9
-; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v9
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vrol_vx_v1i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.v.x v9, a0
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
-; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vrol_vx_v1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vand.vx v10, v9, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vv v8, v8, v9
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vrol_vx_v1i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -958,32 +924,17 @@ define <1 x i64> @vrol_vx_v1i64(<1 x i64> %a, i64 %b) {
 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 
 define <2 x i64> @vrol_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-RV32-LABEL: vrol_vv_v2i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v11, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vv v9, v11, v9
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-RV32-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vrol_vv_v2i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
-; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vrol_vv_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vx v10, v9, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vv v8, v8, v9
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vrol_vv_v2i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -995,34 +946,18 @@ define <2 x i64> @vrol_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
 }
 
 define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) {
-; CHECK-RV32-LABEL: vrol_vx_v2i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v9, a0
-; CHECK-RV32-NEXT:    li a1, 63
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a1
-; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v10, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v10, v10, a0
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v9, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vrol_vx_v2i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.v.x v9, a0
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
-; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vrol_vx_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vand.vx v10, v9, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vv v8, v8, v9
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vrol_vx_v2i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1038,32 +973,17 @@ define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) {
 declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
 
 define <4 x i64> @vrol_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-RV32-LABEL: vrol_vv_v4i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v14, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vsub.vv v10, v14, v10
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v12, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vrol_vv_v4i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV64-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV64-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV64-NEXT:    vrsub.vi v10, v10, 0
-; CHECK-RV64-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vrol_vv_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vand.vx v12, v10, a0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
+; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vrol_vv_v4i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1075,34 +995,18 @@ define <4 x i64> @vrol_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) {
-; CHECK-RV32-LABEL: vrol_vx_v4i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    li a1, 63
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a1
-; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v12, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vrol_vx_v4i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV64-NEXT:    vmv.v.x v10, a0
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV64-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV64-NEXT:    vrsub.vi v10, v10, 0
-; CHECK-RV64-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vrol_vx_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vand.vx v12, v10, a0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
+; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vrol_vx_v4i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1118,32 +1022,17 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) {
 declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
 
 define <8 x i64> @vrol_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-RV32-LABEL: vrol_vv_v8i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV32-NEXT:    vsll.vv v16, v8, v16
-; CHECK-RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v20, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vsub.vv v12, v20, v12
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v16, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vrol_vv_v8i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV64-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV64-NEXT:    vsll.vv v16, v8, v16
-; CHECK-RV64-NEXT:    vrsub.vi v12, v12, 0
-; CHECK-RV64-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vrol_vv_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vand.vx v16, v12, a0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
+; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vrol_vv_v8i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1155,34 +1044,18 @@ define <8 x i64> @vrol_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
 }
 
 define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) {
-; CHECK-RV32-LABEL: vrol_vx_v8i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    li a1, 63
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a1
-; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v16, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v16, v16, a0
-; CHECK-RV32-NEXT:    vand.vx v16, v16, a1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v16
-; CHECK-RV32-NEXT:    vor.vv v8, v12, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vrol_vx_v8i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV64-NEXT:    vmv.v.x v12, a0
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV64-NEXT:    vsll.vv v16, v8, v16
-; CHECK-RV64-NEXT:    vrsub.vi v12, v12, 0
-; CHECK-RV64-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vrol_vx_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vand.vx v16, v12, a0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
+; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vrol_vx_v8i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1196,5 +1069,7 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) {
 }
 
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-RV32: {{.*}}
+; CHECK-RV64: {{.*}}
 ; CHECK-ZVBB32: {{.*}}
 ; CHECK-ZVBB64: {{.*}}

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
index 75758d402d0321..4f26ef8fd491fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
@@ -1554,34 +1554,17 @@ declare <1 x i64> @llvm.fshr.v1i64(<1 x i64>, <1 x i64>, <1 x i64>)
 declare <1 x i64> @llvm.fshl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>)
 
 define <1 x i64> @vror_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-RV32-LABEL: vror_vv_v1i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v10, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vv v11, v10, v9
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v10, a0
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vv v11, v11, v10
-; CHECK-RV32-NEXT:    vsll.vv v11, v8, v11
-; CHECK-RV32-NEXT:    vand.vv v9, v9, v10
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v11
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vror_vv_v1i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
-; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v8, v8, v9
-; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vror_vv_v1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vx v10, v9, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vv v8, v8, v9
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vror_vv_v1i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1593,35 +1576,18 @@ define <1 x i64> @vror_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
 }
 
 define <1 x i64> @vror_vx_v1i64(<1 x i64> %a, i64 %b) {
-; CHECK-RV32-LABEL: vror_vx_v1i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v9, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v10, v9, a0
-; CHECK-RV32-NEXT:    li a1, 63
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v9, a1
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vv v10, v10, v9
-; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vror_vx_v1i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.v.x v9, a0
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
-; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v8, v8, v9
-; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vror_vx_v1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vand.vx v10, v9, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vv v8, v8, v9
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vror_vx_v1i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1637,20 +1603,16 @@ define <1 x i64> @vror_vx_v1i64(<1 x i64> %a, i64 %b) {
 define <1 x i64> @vror_vi_v1i64(<1 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_v1i64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v9, 0
-; CHECK-RV32-NEXT:    li a0, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v10, v9, a0
+; CHECK-RV32-NEXT:    vmv.v.i v9, 1
+; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v9, a0
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vv v10, v10, v9
-; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vand.vi v9, v9, 1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
+; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
+; CHECK-RV32-NEXT:    vmv.v.x v10, a0
+; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v1i64:
@@ -1674,20 +1636,16 @@ define <1 x i64> @vror_vi_v1i64(<1 x i64> %a) {
 define <1 x i64> @vror_vi_rotl_v1i64(<1 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_rotl_v1i64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v9, 0
-; CHECK-RV32-NEXT:    li a0, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v10, v9, a0
+; CHECK-RV32-NEXT:    vmv.v.i v9, 1
+; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v9, a0
-; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vv v10, v10, v9
-; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vand.vi v9, v9, 1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v9
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
+; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-RV32-NEXT:    vmv.v.x v10, a0
+; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v1i64:
@@ -1712,32 +1670,17 @@ declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 
 define <2 x i64> @vror_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-RV32-LABEL: vror_vv_v2i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v11, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vv v9, v11, v9
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v9
-; CHECK-RV32-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vror_vv_v2i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
-; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v8, v8, v9
-; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vror_vv_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vx v10, v9, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vv v8, v8, v9
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vror_vv_v2i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1749,34 +1692,18 @@ define <2 x i64> @vror_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
 }
 
 define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) {
-; CHECK-RV32-LABEL: vror_vx_v2i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v9, a0
-; CHECK-RV32-NEXT:    li a1, 63
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a1
-; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v10, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v10, v10, a0
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v9, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vror_vx_v2i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.v.x v9, a0
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
-; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v8, v8, v9
-; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vror_vx_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vand.vx v10, v9, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vv v8, v8, v9
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vror_vx_v2i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1792,19 +1719,16 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) {
 define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_v2i64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v9, a0
-; CHECK-RV32-NEXT:    vand.vi v9, v9, 1
-; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v10, 0
-; CHECK-RV32-NEXT:    li a1, 1
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v10, v10, a1
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v9, v8
+; CHECK-RV32-NEXT:    vmv.v.i v9, 1
+; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-RV32-NEXT:    li a0, 63
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
+; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
+; CHECK-RV32-NEXT:    vmv.v.x v10, a0
+; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v2i64:
@@ -1828,19 +1752,16 @@ define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) {
 define <2 x i64> @vror_vi_rotl_v2i64(<2 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_rotl_v2i64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v9, a0
-; CHECK-RV32-NEXT:    vand.vi v9, v9, 1
-; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v10, 0
-; CHECK-RV32-NEXT:    li a1, 1
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v10, v10, a1
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v9, v8
+; CHECK-RV32-NEXT:    vmv.v.i v9, 1
+; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-RV32-NEXT:    li a0, 63
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
+; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-RV32-NEXT:    vmv.v.x v10, a0
+; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v2i64:
@@ -1865,32 +1786,17 @@ declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
 declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
 
 define <4 x i64> @vror_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-RV32-LABEL: vror_vv_v4i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v14, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vsub.vv v10, v14, v10
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v12, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vror_vv_v4i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV64-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV64-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-RV64-NEXT:    vrsub.vi v10, v10, 0
-; CHECK-RV64-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV64-NEXT:    vsll.vv v8, v8, v10
-; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vror_vv_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vand.vx v12, v10, a0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vror_vv_v4i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1902,34 +1808,18 @@ define <4 x i64> @vror_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) {
-; CHECK-RV32-LABEL: vror_vx_v4i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    li a1, 63
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a1
-; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v12, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v10, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vror_vx_v4i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV64-NEXT:    vmv.v.x v10, a0
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV64-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-RV64-NEXT:    vrsub.vi v10, v10, 0
-; CHECK-RV64-NEXT:    vand.vx v10, v10, a0
-; CHECK-RV64-NEXT:    vsll.vv v8, v8, v10
-; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vror_vx_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vand.vx v12, v10, a0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vror_vx_v4i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -1945,19 +1835,16 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) {
 define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_v4i64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
-; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v12, 0
-; CHECK-RV32-NEXT:    li a1, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v12, v12, a1
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v10, v8
+; CHECK-RV32-NEXT:    vmv.v.i v10, 1
+; CHECK-RV32-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-RV32-NEXT:    li a0, 63
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
+; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
+; CHECK-RV32-NEXT:    vmv.v.x v12, a0
+; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v4i64:
@@ -1981,19 +1868,16 @@ define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) {
 define <4 x i64> @vror_vi_rotl_v4i64(<4 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_rotl_v4i64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
-; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v12, 0
-; CHECK-RV32-NEXT:    li a1, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v12, v12, a1
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v10, v8
+; CHECK-RV32-NEXT:    vmv.v.i v10, 1
+; CHECK-RV32-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-RV32-NEXT:    li a0, 63
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
+; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-RV32-NEXT:    vmv.v.x v12, a0
+; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v4i64:
@@ -2018,32 +1902,17 @@ declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
 declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
 
 define <8 x i64> @vror_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-RV32-LABEL: vror_vv_v8i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV32-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v20, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vsub.vv v12, v20, v12
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v16, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vror_vv_v8i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV64-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV64-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-RV64-NEXT:    vrsub.vi v12, v12, 0
-; CHECK-RV64-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV64-NEXT:    vsll.vv v8, v8, v12
-; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vror_vv_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vand.vx v16, v12, a0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
+; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vror_vv_v8i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -2055,34 +1924,18 @@ define <8 x i64> @vror_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
 }
 
 define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) {
-; CHECK-RV32-LABEL: vror_vx_v8i64:
-; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    li a1, 63
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a1
-; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v16, 0
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v16, v16, a0
-; CHECK-RV32-NEXT:    vand.vx v16, v16, a1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v16
-; CHECK-RV32-NEXT:    vor.vv v8, v12, v8
-; CHECK-RV32-NEXT:    ret
-;
-; CHECK-RV64-LABEL: vror_vx_v8i64:
-; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV64-NEXT:    vmv.v.x v12, a0
-; CHECK-RV64-NEXT:    li a0, 63
-; CHECK-RV64-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV64-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-RV64-NEXT:    vrsub.vi v12, v12, 0
-; CHECK-RV64-NEXT:    vand.vx v12, v12, a0
-; CHECK-RV64-NEXT:    vsll.vv v8, v8, v12
-; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
-; CHECK-RV64-NEXT:    ret
+; CHECK-LABEL: vror_vx_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vand.vx v16, v12, a0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
+; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vror_vx_v8i64:
 ; CHECK-ZVBB:       # %bb.0:
@@ -2098,19 +1951,16 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) {
 define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_v8i64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
-; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v16, 0
-; CHECK-RV32-NEXT:    li a1, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v16, v16, a1
-; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v16
-; CHECK-RV32-NEXT:    vor.vv v8, v12, v8
+; CHECK-RV32-NEXT:    vmv.v.i v12, 1
+; CHECK-RV32-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-RV32-NEXT:    li a0, 63
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
+; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
+; CHECK-RV32-NEXT:    vmv.v.x v16, a0
+; CHECK-RV32-NEXT:    vand.vi v16, v16, 1
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v16
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v8i64:
@@ -2134,19 +1984,16 @@ define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) {
 define <8 x i64> @vror_vi_rotl_v8i64(<8 x i64> %a) {
 ; CHECK-RV32-LABEL: vror_vi_rotl_v8i64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
-; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-RV32-NEXT:    vmv.v.i v16, 0
-; CHECK-RV32-NEXT:    li a1, 1
-; CHECK-RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vsub.vx v16, v16, a1
-; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v16
-; CHECK-RV32-NEXT:    vor.vv v8, v12, v8
+; CHECK-RV32-NEXT:    vmv.v.i v12, 1
+; CHECK-RV32-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-RV32-NEXT:    li a0, 63
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
+; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-RV32-NEXT:    vmv.v.x v16, a0
+; CHECK-RV32-NEXT:    vand.vi v16, v16, 1
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v16
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v8i64:

More information about the llvm-commits mailing list