[llvm] aae155c - [RISCV] Use a build_vector instead of a chain of insert_vector_elts for vXi1 build_vector lowering.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 21 09:00:28 PDT 2023


Author: Craig Topper
Date: 2023-06-21T08:57:46-07:00
New Revision: aae155c50b07cf2358d72055457c133f4a71e1eb

URL: https://github.com/llvm/llvm-project/commit/aae155c50b07cf2358d72055457c133f4a71e1eb
DIFF: https://github.com/llvm/llvm-project/commit/aae155c50b07cf2358d72055457c133f4a71e1eb.diff

LOG: [RISCV] Use a build_vector instead of a chain of insert_vector_elts for vXi1 build_vector lowering.

A build_vector is the canonical representation, rather than a chain of
multiple insert_vector_elts.
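
For reference, the shape of the change in lowerBUILD_VECTOR, condensed
from the diff below (the surrounding loop is elided):

    // Before: build the packed integer vector one element at a time.
    //   SDValue Vec = DAG.getUNDEF(IntegerViaVecVT);
    //   ...
    //   Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntegerViaVecVT, Vec,
    //                     Elt, DAG.getConstant(IntegerEltIdx, DL, XLenVT));
    // After: collect the packed elements and emit one canonical node.
    SmallVector<SDValue, 8> Elts(IntegerViaVecElts);
    // ... Elts[IntegerEltIdx] = Elt; for each packed integer chunk ...
    SDValue Vec = DAG.getBuildVector(IntegerViaVecVT, DL, Elts);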

Unfortunately, this regresses quite a few tests for now, primarily because
the build_vector path does not have a vmv.s.x special case, but I hope we
can improve this with future patches.

Stress testing in our downstream found an infinite loop in DAG combine.
This patch breaks the infinite loop.

The insert_vector_elt chain starts with a fixed-vector undef. Fixed-vector
undef is currently expanded to a build_vector of 0s, which gets lowered to
a vmv.v.i. The insert chain overwrites all elements, so
SimplifyDemandedVectorElts turns the vmv.v.i back into undef and the cycle
repeats.
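
To make the shape of the cycle concrete, here is a minimal toy model in
standalone C++ (deliberately not real SelectionDAG code) of two rewrites
that invert each other, which is exactly the non-termination hazard
described above:

    #include <cassert>

    // Rewrite A models "fixed-vector undef -> build_vector of 0s ->
    // vmv.v.i". Rewrite B models SimplifyDemandedVectorElts: the insert
    // chain overwrites every element of the splat, so the splat folds
    // back to undef. Iterating A and B to a fixpoint never terminates.
    enum class Node { Undef, ZeroSplat };

    static Node expandUndef(Node N) {
      return N == Node::Undef ? Node::ZeroSplat : N;
    }
    static Node simplifyDemanded(Node N) {
      return N == Node::ZeroSplat ? Node::Undef : N;
    }

    int main() {
      Node N = Node::Undef;
      for (int I = 0; I < 4; ++I) {
        N = expandUndef(N);      // undef -> zero splat
        N = simplifyDemanded(N); // zero splat -> undef: no progress
      }
      assert(N == Node::Undef); // back where we started every iteration
      return 0;
    }

Starting from a build_vector instead of an undef-rooted insert chain
removes rewrite A from the picture, which is why this patch breaks the
loop.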

We should probably custom lower fixed-vector undef to scalable-vector
undef. I think that would also fix the infinite loop, but I didn't test
that.
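
For anyone prototyping that follow-up, a minimal hypothetical sketch
(untested; it assumes the getContainerForFixedLengthVector and
convertFromScalableVector helpers already present in
RISCVISelLowering.cpp) might look like:

    // Hypothetical, untested sketch: lower a fixed-vector UNDEF through a
    // scalable container so it never expands to a build_vector of 0s.
    static SDValue lowerFixedVectorUNDEF(SDValue Op, SelectionDAG &DAG,
                                         const RISCVSubtarget &Subtarget) {
      MVT VT = Op.getSimpleValueType();
      MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
      // A scalable-vector undef needs no expansion; convert the result
      // back to the original fixed-length type.
      return convertFromScalableVector(VT, DAG.getUNDEF(ContainerVT), DAG,
                                       Subtarget);
    }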

Reviewed By: luke

Differential Revision: https://reviews.llvm.org/D153399

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
    llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
    llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
    llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f68b49d6d3d25..5fab317a4279b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2965,13 +2965,14 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
         return SDValue();
       // Now we can create our integer vector type. Note that it may be larger
       // than the resulting mask type: v4i1 would use v1i8 as its integer type.
+      unsigned IntegerViaVecElts = divideCeil(NumElts, NumViaIntegerBits);
       MVT IntegerViaVecVT =
           MVT::getVectorVT(MVT::getIntegerVT(NumViaIntegerBits),
-                           divideCeil(NumElts, NumViaIntegerBits));
+                           IntegerViaVecElts);
 
       uint64_t Bits = 0;
       unsigned BitPos = 0, IntegerEltIdx = 0;
-      SDValue Vec = DAG.getUNDEF(IntegerViaVecVT);
+      SmallVector<SDValue, 8> Elts(IntegerViaVecElts);
 
       for (unsigned I = 0; I < NumElts;) {
         SDValue V = Op.getOperand(I);
@@ -2986,14 +2987,15 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
           if (NumViaIntegerBits <= 32)
             Bits = SignExtend64<32>(Bits);
           SDValue Elt = DAG.getConstant(Bits, DL, XLenVT);
-          Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntegerViaVecVT, Vec,
-                            Elt, DAG.getConstant(IntegerEltIdx, DL, XLenVT));
+          Elts[IntegerEltIdx] = Elt;
           Bits = 0;
           BitPos = 0;
           IntegerEltIdx++;
         }
       }
 
+      SDValue Vec = DAG.getBuildVector(IntegerViaVecVT, DL, Elts);
+
       if (NumElts < NumViaIntegerBits) {
         // If we're producing a smaller vector than our minimum legal integer
         // type, bitcast to the equivalent (known-legal) mask type, and extract

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
index deceb6f89c9a4..4f29bc48228d2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
@@ -11,7 +11,7 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vadd.vi v8, v11, -1
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -27,7 +27,7 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -42,12 +42,12 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-NEXT:    vid.v v11
 ; RV32-NEXT:    vrgather.vv v10, v8, v11
+; RV32-NEXT:    vadd.vi v8, v11, -1
 ; RV32-NEXT:    lui a0, 11
 ; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmv.v.x v0, a0
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
-; RV32-NEXT:    vadd.vi v8, v11, -1
 ; RV32-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
 ; RV32-NEXT:    ret
@@ -57,12 +57,12 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64-NEXT:    vid.v v11
 ; RV64-NEXT:    vrgather.vv v10, v8, v11
+; RV64-NEXT:    vadd.vi v8, v11, -1
 ; RV64-NEXT:    lui a0, 11
 ; RV64-NEXT:    addiw a0, a0, -1366
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vmv.v.x v0, a0
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
-; RV64-NEXT:    vadd.vi v8, v11, -1
 ; RV64-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v10
 ; RV64-NEXT:    ret
@@ -80,7 +80,7 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
 ; RV32-NEXT:    lui a0, 11
 ; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmv.v.x v0, a0
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV32-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
@@ -95,7 +95,7 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
 ; RV64-NEXT:    lui a0, 11
 ; RV64-NEXT:    addiw a0, a0, -1366
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vmv.v.x v0, a0
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v10
@@ -110,8 +110,7 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 10
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vadd.vi v8, v11, -1
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -126,8 +125,7 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
-; CHECK-NEXT:    li a0, 10
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -143,7 +141,7 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vadd.vi v8, v11, -1
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -159,7 +157,7 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -181,9 +179,8 @@ define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
 define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: trn2.v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
 ; CHECK-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -198,8 +195,7 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 10
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vadd.vi v8, v11, -1
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -214,8 +210,7 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
-; CHECK-NEXT:    li a0, 10
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -237,9 +232,8 @@ define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: trn2.v2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
 ; CHECK-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -261,9 +255,8 @@ define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
 define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) {
 ; CHECK-LABEL: trn2.v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
 ; CHECK-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -278,8 +271,7 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) {
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 10
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vadd.vi v8, v11, -1
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -294,8 +286,7 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) {
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
-; CHECK-NEXT:    li a0, 10
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -317,9 +308,8 @@ define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) {
 define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) {
 ; CHECK-LABEL: trn2.v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
 ; CHECK-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -334,8 +324,7 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 10
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vadd.vi v8, v11, -1
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -350,8 +339,7 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
-; CHECK-NEXT:    li a0, 10
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 10
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -367,7 +355,7 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vadd.vi v8, v11, -1
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -383,7 +371,7 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
index 311491fa6018c..bd59f28dd4eba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
@@ -58,7 +58,7 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; V-NEXT:    lui a3, 983765
 ; V-NEXT:    addiw a3, a3, 873
 ; V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; V-NEXT:    vmv.s.x v0, a3
+; V-NEXT:    vmv.v.x v0, a3
 ; V-NEXT:    li a3, 32
 ; V-NEXT:    li a4, 5
 ; V-NEXT:  .LBB1_1: # %vector.body
@@ -82,7 +82,7 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; ZVE32F-NEXT:    lui a3, 983765
 ; ZVE32F-NEXT:    addiw a3, a3, 873
 ; ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a3
+; ZVE32F-NEXT:    vmv.v.x v0, a3
 ; ZVE32F-NEXT:    li a3, 32
 ; ZVE32F-NEXT:    li a4, 5
 ; ZVE32F-NEXT:  .LBB1_1: # %vector.body
@@ -333,7 +333,7 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read
 ; V-NEXT:    lui a4, 983765
 ; V-NEXT:    addiw a4, a4, 873
 ; V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; V-NEXT:    vmv.s.x v0, a4
+; V-NEXT:    vmv.v.x v0, a4
 ; V-NEXT:    li a4, 5
 ; V-NEXT:  .LBB6_1: # %vector.body
 ; V-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -357,7 +357,7 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read
 ; ZVE32F-NEXT:    lui a4, 983765
 ; ZVE32F-NEXT:    addiw a4, a4, 873
 ; ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a4
+; ZVE32F-NEXT:    vmv.v.x v0, a4
 ; ZVE32F-NEXT:    li a4, 5
 ; ZVE32F-NEXT:  .LBB6_1: # %vector.body
 ; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index 65e915b978527..16e0114511fd4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -1395,30 +1395,30 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v11, v11, a3, v0.t
 ; RV32-NEXT:    vor.vv v10, v11, v10, v0.t
-; RV32-NEXT:    vsrl.vi v11, v8, 24, v0.t
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v11, v11, a4, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV32-NEXT:    li a5, 5
+; RV32-NEXT:    vsrl.vi v11, v8, 8, v0.t
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a5
-; RV32-NEXT:    vmv.v.i v13, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v13, v13, a5, v0
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    vmerge.vxm v12, v12, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vand.vv v12, v12, v13, v0.t
-; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vand.vv v11, v11, v12, v0.t
+; RV32-NEXT:    vsrl.vi v13, v8, 24, v0.t
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v13, v13, a4, v0.t
+; RV32-NEXT:    vor.vv v11, v11, v13, v0.t
 ; RV32-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV32-NEXT:    vsll.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v12, v12, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v13, v0.t
+; RV32-NEXT:    vand.vx v13, v8, a3, v0.t
+; RV32-NEXT:    vsll.vx v13, v13, a2, v0.t
+; RV32-NEXT:    vor.vv v11, v11, v13, v0.t
+; RV32-NEXT:    vand.vx v13, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v13, v13, 24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v13, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v11, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
@@ -1528,30 +1528,30 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v10, v10, a3
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 8
-; RV32-NEXT:    li a4, 5
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    vmv.v.i v11, 0
+; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v11, v11, a4, v0
+; RV32-NEXT:    vmerge.vxm v10, v10, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v11
+; RV32-NEXT:    vsrl.vi v11, v8, 8
+; RV32-NEXT:    vand.vv v11, v11, v10
 ; RV32-NEXT:    vsrl.vi v12, v8, 24
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a1
+; RV32-NEXT:    vor.vv v11, v11, v12
+; RV32-NEXT:    vor.vv v9, v11, v9
+; RV32-NEXT:    vsll.vx v11, v8, a1
 ; RV32-NEXT:    vand.vx v12, v8, a3
 ; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a4
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v11, v11, v12
+; RV32-NEXT:    vand.vv v10, v8, v10
+; RV32-NEXT:    vsll.vi v10, v10, 8
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    lui a1, 61681
@@ -1665,30 +1665,31 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v14, v14, a3, v0.t
 ; RV32-NEXT:    vor.vv v12, v14, v12, v0.t
-; RV32-NEXT:    vsrl.vi v14, v8, 24, v0.t
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v14, v14, a4, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    li a5, 85
+; RV32-NEXT:    vsrl.vi v14, v8, 8, v0.t
+; RV32-NEXT:    li a4, 85
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a5
-; RV32-NEXT:    vmv.v.i v18, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v18, v18, a5, v0
+; RV32-NEXT:    vmv.v.i v16, 0
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v10
-; RV32-NEXT:    vand.vv v16, v16, v18, v0.t
-; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vand.vv v14, v14, v16, v0.t
+; RV32-NEXT:    vsrl.vi v18, v8, 24, v0.t
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v18, v18, a4, v0.t
+; RV32-NEXT:    vor.vv v14, v14, v18, v0.t
 ; RV32-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV32-NEXT:    vsll.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v16, v16, a2, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v18, v0.t
+; RV32-NEXT:    vand.vx v18, v8, a3, v0.t
+; RV32-NEXT:    vsll.vx v18, v18, a2, v0.t
+; RV32-NEXT:    vor.vv v14, v14, v18, v0.t
+; RV32-NEXT:    vand.vx v18, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v18, v18, 24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v18, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v14, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
@@ -1798,30 +1799,31 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v12, v12, a3
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 8
 ; RV32-NEXT:    li a4, 85
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    vmv.v.i v14, 0
+; RV32-NEXT:    vmv.v.i v12, 0
 ; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v14, v14, a4, v0
+; RV32-NEXT:    vmerge.vxm v12, v12, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v14
+; RV32-NEXT:    vsrl.vi v14, v8, 8
+; RV32-NEXT:    vand.vv v14, v14, v12
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a1
+; RV32-NEXT:    vor.vv v14, v14, v16
+; RV32-NEXT:    vor.vv v10, v14, v10
+; RV32-NEXT:    vsll.vx v14, v8, a1
 ; RV32-NEXT:    vand.vx v16, v8, a3
 ; RV32-NEXT:    vsll.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a4
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v14, v14, v16
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v14, v8
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    lui a1, 61681
@@ -1934,34 +1936,35 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v20, v20, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v20, v16, v0.t
-; RV32-NEXT:    vsrl.vi v20, v8, 24, v0.t
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v24, v20, a4, v0.t
-; RV32-NEXT:    vsrl.vi v28, v8, 8, v0.t
-; RV32-NEXT:    lui a5, 5
-; RV32-NEXT:    addi a5, a5, 1365
+; RV32-NEXT:    vor.vv v20, v20, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT:    lui a4, 5
+; RV32-NEXT:    addi a4, a4, 1365
+; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a5
-; RV32-NEXT:    vmv.v.i v20, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v20, v20, a5, v0
+; RV32-NEXT:    vmv.v.i v16, 0
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v12
-; RV32-NEXT:    vand.vv v28, v28, v20, v0.t
-; RV32-NEXT:    vor.vv v24, v28, v24, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v28, v8, 24, v0.t
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v28, v28, a4, v0.t
+; RV32-NEXT:    vor.vv v24, v24, v28, v0.t
+; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsll.vx v24, v8, a1, v0.t
 ; RV32-NEXT:    vand.vx v28, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vx v28, v28, a2, v0.t
 ; RV32-NEXT:    vor.vv v24, v24, v28, v0.t
 ; RV32-NEXT:    vand.vx v28, v8, a4, v0.t
 ; RV32-NEXT:    vsll.vi v28, v28, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vor.vv v8, v28, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
@@ -2069,15 +2072,16 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v16, v16, a3
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsrl.vi v20, v8, 8
 ; RV32-NEXT:    lui a4, 5
 ; RV32-NEXT:    addi a4, a4, 1365
+; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
 ; RV32-NEXT:    vmv.v.i v16, 0
 ; RV32-NEXT:    lui a4, 1044480
 ; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v20, v8, 8
 ; RV32-NEXT:    vand.vv v20, v20, v16
 ; RV32-NEXT:    vsrl.vi v24, v8, 24
 ; RV32-NEXT:    lui a4, 4080
@@ -2088,11 +2092,11 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    vand.vx v24, v8, a3
 ; RV32-NEXT:    vsll.vx v24, v24, a2
 ; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vand.vx v24, v8, a4
-; RV32-NEXT:    vsll.vi v24, v24, 24
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v20, v8
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
@@ -2200,140 +2204,245 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    vmv1r.v v1, v0
 ; RV32-NEXT:    li a3, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a5, a1, -256
-; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV32-NEXT:    addi a4, a1, -256
+; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    vsll.vx v24, v24, a5, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    lui a6, 4080
-; RV32-NEXT:    vand.vx v24, v24, a6, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a6, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vmv.v.i v24, 0
 ; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    lui a7, 1044480
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    vmerge.vxm v16, v24, a7, v0
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    li t0, 24
-; RV32-NEXT:    mul a7, a7, t0
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vs8r.v v16, (a7) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.v.x v0, a2
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmerge.vxm v24, v24, a7, v0
+; RV32-NEXT:    addi a7, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v1
-; RV32-NEXT:    addi a7, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    li t0, 24
-; RV32-NEXT:    mul a7, a7, t0
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vl8r.v v16, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    csrr a7, vlenb
 ; RV32-NEXT:    slli a7, a7, 3
 ; RV32-NEXT:    add a7, sp, a7
 ; RV32-NEXT:    addi a7, a7, 16
 ; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a7, vlenb
 ; RV32-NEXT:    slli a7, a7, 4
 ; RV32-NEXT:    add a7, sp, a7
 ; RV32-NEXT:    addi a7, a7, 16
 ; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a7, vlenb
 ; RV32-NEXT:    slli a7, a7, 4
 ; RV32-NEXT:    add a7, sp, a7
 ; RV32-NEXT:    addi a7, a7, 16
 ; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vsll.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v8, a5, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vx v24, v8, a6, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a5, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a4, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v16, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 4, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v24, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v24, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v24, 1, v0.t
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -2427,73 +2536,70 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    li a5, 32
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    lui a6, 1044480
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vmerge.vxm v16, v16, a6, v0
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a2
+; RV32-NEXT:    lui a3, 1044480
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmerge.vxm v16, v16, a3, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 8
 ; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    lui a6, 4080
 ; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    vand.vx v0, v0, a6
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vand.vx v0, v0, a3
 ; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    li a4, 56
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    vsrl.vx v0, v8, a5
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    addi a6, a6, -256
+; RV32-NEXT:    vand.vx v0, v0, a6
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    addi a7, sp, 16
 ; RV32-NEXT:    vl8r.v v0, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a3
-; RV32-NEXT:    vsll.vx v0, v0, a2
-; RV32-NEXT:    vsll.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v8, v8, a6
-; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vand.vx v0, v8, a3
+; RV32-NEXT:    vsll.vi v0, v0, 24
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vsll.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v8, v8, a6
+; RV32-NEXT:    vsll.vx v8, v8, a5
+; RV32-NEXT:    vor.vv v8, v0, v8
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    addi a3, a3, -241
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
@@ -2578,140 +2684,245 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    vmv1r.v v1, v0
 ; RV32-NEXT:    li a3, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a5, a1, -256
-; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV32-NEXT:    addi a4, a1, -256
+; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    vsll.vx v24, v24, a5, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    lui a6, 4080
-; RV32-NEXT:    vand.vx v24, v24, a6, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a6, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vmv.v.i v24, 0
 ; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    lui a7, 1044480
-; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    vmerge.vxm v16, v24, a7, v0
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    li t0, 24
-; RV32-NEXT:    mul a7, a7, t0
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vs8r.v v16, (a7) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.v.x v0, a2
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmerge.vxm v24, v24, a7, v0
+; RV32-NEXT:    addi a7, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v1
-; RV32-NEXT:    addi a7, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a7, vlenb
-; RV32-NEXT:    li t0, 24
-; RV32-NEXT:    mul a7, a7, t0
-; RV32-NEXT:    add a7, sp, a7
-; RV32-NEXT:    addi a7, a7, 16
-; RV32-NEXT:    vl8r.v v16, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    csrr a7, vlenb
 ; RV32-NEXT:    slli a7, a7, 3
 ; RV32-NEXT:    add a7, sp, a7
 ; RV32-NEXT:    addi a7, a7, 16
 ; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a7, vlenb
 ; RV32-NEXT:    slli a7, a7, 4
 ; RV32-NEXT:    add a7, sp, a7
 ; RV32-NEXT:    addi a7, a7, 16
 ; RV32-NEXT:    vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a7, vlenb
 ; RV32-NEXT:    slli a7, a7, 4
 ; RV32-NEXT:    add a7, sp, a7
 ; RV32-NEXT:    addi a7, a7, 16
 ; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vsll.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v8, a5, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vx v24, v8, a6, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a5, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a4, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v16, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 4, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v24, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    addi a3, a3, 819
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v24, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v24, 1, v0.t
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -2805,73 +3016,70 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    li a5, 32
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    lui a6, 1044480
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vmerge.vxm v16, v16, a6, v0
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a2
+; RV32-NEXT:    lui a3, 1044480
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmerge.vxm v16, v16, a3, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 8
 ; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    lui a6, 4080
 ; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    vand.vx v0, v0, a6
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vand.vx v0, v0, a3
 ; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    li a4, 56
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    vsrl.vx v0, v8, a5
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    addi a6, a6, -256
+; RV32-NEXT:    vand.vx v0, v0, a6
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    addi a7, sp, 16
 ; RV32-NEXT:    vl8r.v v0, (a7) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a3
-; RV32-NEXT:    vsll.vx v0, v0, a2
-; RV32-NEXT:    vsll.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v8, v8, a6
-; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vand.vx v0, v8, a3
+; RV32-NEXT:    vsll.vi v0, v0, 24
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vsll.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v8, v8, a6
+; RV32-NEXT:    vsll.vx v8, v8, a5
+; RV32-NEXT:    vor.vv v8, v0, v8
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    addi a3, a3, -241
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index 0dbbe024ddd23..6fb287d440655 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -165,39 +165,38 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v10, v10, a4
-; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    vmv.s.x v0, a5
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v11, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v11, v11, a5, v0
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    vmerge.vxm v9, v9, a1, v0
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a4
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 8
+; RV32-NEXT:    vand.vv v10, v10, v9
+; RV32-NEXT:    vsrl.vi v11, v8, 24
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    vand.vx v11, v11, a1
+; RV32-NEXT:    vor.vv v10, v10, v11
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsrl.vx v12, v8, a3
+; RV32-NEXT:    lui a4, 16
+; RV32-NEXT:    addi a4, a4, -256
+; RV32-NEXT:    vand.vx v12, v12, a4
+; RV32-NEXT:    vor.vv v11, v12, v11
+; RV32-NEXT:    vor.vv v10, v10, v11
+; RV32-NEXT:    vand.vv v9, v8, v9
+; RV32-NEXT:    vsll.vi v9, v9, 8
+; RV32-NEXT:    vand.vx v11, v8, a1
+; RV32-NEXT:    vsll.vi v11, v11, 24
+; RV32-NEXT:    vor.vv v9, v11, v9
+; RV32-NEXT:    vsll.vx v11, v8, a2
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vx v8, v8, a3
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
@@ -697,39 +696,40 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    li a1, 56
-; LMULMAX2-RV32-NEXT:    vsrl.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT:    li a2, 40
-; LMULMAX2-RV32-NEXT:    vsrl.vx v12, v8, a2
-; LMULMAX2-RV32-NEXT:    lui a3, 16
-; LMULMAX2-RV32-NEXT:    addi a3, a3, -256
-; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a3
-; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v12, v8, 24
-; LMULMAX2-RV32-NEXT:    lui a4, 4080
-; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a4
-; LMULMAX2-RV32-NEXT:    li a5, 85
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a5
+; LMULMAX2-RV32-NEXT:    li a1, 85
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.i v14, 0
-; LMULMAX2-RV32-NEXT:    lui a5, 1044480
-; LMULMAX2-RV32-NEXT:    vmerge.vxm v14, v14, a5, v0
+; LMULMAX2-RV32-NEXT:    vmv.v.i v10, 0
+; LMULMAX2-RV32-NEXT:    lui a1, 1044480
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vsrl.vi v16, v8, 8
-; LMULMAX2-RV32-NEXT:    vand.vv v16, v16, v14
-; LMULMAX2-RV32-NEXT:    vor.vv v12, v16, v12
-; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT:    vsll.vx v12, v8, a1
-; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a3
-; LMULMAX2-RV32-NEXT:    vsll.vx v16, v16, a2
-; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v16
-; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a4
-; LMULMAX2-RV32-NEXT:    vsll.vi v16, v16, 24
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v14
-; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 8
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v16, v8
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v12, v8
+; LMULMAX2-RV32-NEXT:    vsrl.vi v12, v8, 8
+; LMULMAX2-RV32-NEXT:    vand.vv v12, v12, v10
+; LMULMAX2-RV32-NEXT:    vsrl.vi v14, v8, 24
+; LMULMAX2-RV32-NEXT:    lui a1, 4080
+; LMULMAX2-RV32-NEXT:    vand.vx v14, v14, a1
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v14
+; LMULMAX2-RV32-NEXT:    li a2, 56
+; LMULMAX2-RV32-NEXT:    vsrl.vx v14, v8, a2
+; LMULMAX2-RV32-NEXT:    li a3, 40
+; LMULMAX2-RV32-NEXT:    vsrl.vx v16, v8, a3
+; LMULMAX2-RV32-NEXT:    lui a4, 16
+; LMULMAX2-RV32-NEXT:    addi a4, a4, -256
+; LMULMAX2-RV32-NEXT:    vand.vx v16, v16, a4
+; LMULMAX2-RV32-NEXT:    vor.vv v14, v16, v14
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v14
+; LMULMAX2-RV32-NEXT:    vand.vv v10, v8, v10
+; LMULMAX2-RV32-NEXT:    vsll.vi v10, v10, 8
+; LMULMAX2-RV32-NEXT:    vand.vx v14, v8, a1
+; LMULMAX2-RV32-NEXT:    vsll.vi v14, v14, 24
+; LMULMAX2-RV32-NEXT:    vor.vv v10, v14, v10
+; LMULMAX2-RV32-NEXT:    vsll.vx v14, v8, a2
+; LMULMAX2-RV32-NEXT:    vand.vx v8, v8, a4
+; LMULMAX2-RV32-NEXT:    vsll.vx v8, v8, a3
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v14, v8
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v12
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
@@ -828,41 +828,40 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ; LMULMAX1-RV32-LABEL: bitreverse_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle64.v v10, (a1)
-; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    li a2, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v9, v10, a2
-; LMULMAX1-RV32-NEXT:    li a3, 40
-; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v10, a3
-; LMULMAX1-RV32-NEXT:    lui a4, 16
-; LMULMAX1-RV32-NEXT:    addi a4, a4, -256
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v10, 24
-; LMULMAX1-RV32-NEXT:    lui a5, 4080
-; LMULMAX1-RV32-NEXT:    vand.vx v12, v9, a5
-; LMULMAX1-RV32-NEXT:    li a6, 5
-; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a6
+; LMULMAX1-RV32-NEXT:    vmv.v.i v0, 5
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; LMULMAX1-RV32-NEXT:    vmv.v.i v9, 0
-; LMULMAX1-RV32-NEXT:    lui a6, 1044480
-; LMULMAX1-RV32-NEXT:    vmerge.vxm v9, v9, a6, v0
+; LMULMAX1-RV32-NEXT:    lui a2, 1044480
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v9, v9, a2, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v10, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v9
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v10, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v9
+; LMULMAX1-RV32-NEXT:    vsrl.vi v12, v10, 24
+; LMULMAX1-RV32-NEXT:    lui a2, 4080
+; LMULMAX1-RV32-NEXT:    vand.vx v12, v12, a2
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v12
+; LMULMAX1-RV32-NEXT:    li a3, 56
+; LMULMAX1-RV32-NEXT:    vsrl.vx v12, v10, a3
+; LMULMAX1-RV32-NEXT:    li a4, 40
+; LMULMAX1-RV32-NEXT:    vsrl.vx v13, v10, a4
+; LMULMAX1-RV32-NEXT:    lui a5, 16
+; LMULMAX1-RV32-NEXT:    addi a5, a5, -256
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v13, a5
 ; LMULMAX1-RV32-NEXT:    vor.vv v12, v13, v12
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v12, v11
-; LMULMAX1-RV32-NEXT:    vsll.vx v12, v10, a2
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v10, a4
-; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v10, a5
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v12
+; LMULMAX1-RV32-NEXT:    vand.vv v12, v10, v9
+; LMULMAX1-RV32-NEXT:    vsll.vi v12, v12, 8
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v10, a2
 ; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 24
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v9
-; LMULMAX1-RV32-NEXT:    vsll.vi v10, v10, 8
+; LMULMAX1-RV32-NEXT:    vor.vv v12, v13, v12
+; LMULMAX1-RV32-NEXT:    vsll.vx v13, v10, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a5
+; LMULMAX1-RV32-NEXT:    vsll.vx v10, v10, a4
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v13, v10
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v12, v10
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v12
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v11
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v10, 4
 ; LMULMAX1-RV32-NEXT:    lui a6, 61681
@@ -894,26 +893,26 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v14
 ; LMULMAX1-RV32-NEXT:    vadd.vv v10, v10, v10
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v8, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vx v15, v8, a3
-; LMULMAX1-RV32-NEXT:    vand.vx v15, v15, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v15, v11
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v9
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v8, 24
-; LMULMAX1-RV32-NEXT:    vand.vx v15, v15, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v16, v8, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v16, v16, v9
+; LMULMAX1-RV32-NEXT:    vand.vx v15, v15, a2
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v15
+; LMULMAX1-RV32-NEXT:    vsrl.vx v15, v8, a3
+; LMULMAX1-RV32-NEXT:    vsrl.vx v16, v8, a4
+; LMULMAX1-RV32-NEXT:    vand.vx v16, v16, a5
 ; LMULMAX1-RV32-NEXT:    vor.vv v15, v16, v15
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v15, v11
-; LMULMAX1-RV32-NEXT:    vsll.vx v15, v8, a2
-; LMULMAX1-RV32-NEXT:    vand.vx v16, v8, a4
-; LMULMAX1-RV32-NEXT:    vsll.vx v16, v16, a3
-; LMULMAX1-RV32-NEXT:    vor.vv v15, v15, v16
-; LMULMAX1-RV32-NEXT:    vand.vx v16, v8, a5
-; LMULMAX1-RV32-NEXT:    vsll.vi v16, v16, 24
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v16, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v15
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v8, v9
+; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 8
+; LMULMAX1-RV32-NEXT:    vand.vx v15, v8, a2
+; LMULMAX1-RV32-NEXT:    vsll.vi v15, v15, 24
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v15, v9
+; LMULMAX1-RV32-NEXT:    vsll.vx v15, v8, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a5
+; LMULMAX1-RV32-NEXT:    vsll.vx v8, v8, a4
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v15, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v11
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
 ; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v12

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index 590c631c541e2..7c40108e340ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -435,30 +435,30 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v11, v11, a3, v0.t
 ; RV32-NEXT:    vor.vv v10, v11, v10, v0.t
-; RV32-NEXT:    vsrl.vi v11, v8, 24, v0.t
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v11, v11, a4, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV32-NEXT:    li a5, 5
+; RV32-NEXT:    vsrl.vi v11, v8, 8, v0.t
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a5
-; RV32-NEXT:    vmv.v.i v13, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v13, v13, a5, v0
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    vmerge.vxm v12, v12, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v9
-; RV32-NEXT:    vand.vv v12, v12, v13, v0.t
-; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vand.vv v11, v11, v12, v0.t
+; RV32-NEXT:    vsrl.vi v13, v8, 24, v0.t
+; RV32-NEXT:    lui a0, 4080
+; RV32-NEXT:    vand.vx v13, v13, a0, v0.t
+; RV32-NEXT:    vor.vv v11, v11, v13, v0.t
 ; RV32-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV32-NEXT:    vsll.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v12, v12, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vand.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v13, v0.t
+; RV32-NEXT:    vand.vx v13, v8, a3, v0.t
+; RV32-NEXT:    vsll.vx v13, v13, a2, v0.t
+; RV32-NEXT:    vor.vv v11, v11, v13, v0.t
+; RV32-NEXT:    vand.vx v13, v8, a0, v0.t
+; RV32-NEXT:    vsll.vi v13, v13, 24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v13, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v11, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    ret
@@ -511,30 +511,30 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v10, v10, a3
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 8
-; RV32-NEXT:    li a4, 5
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    vmv.v.i v11, 0
+; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v11, v11, a4, v0
+; RV32-NEXT:    vmerge.vxm v10, v10, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v11
+; RV32-NEXT:    vsrl.vi v11, v8, 8
+; RV32-NEXT:    vand.vv v11, v11, v10
 ; RV32-NEXT:    vsrl.vi v12, v8, 24
 ; RV32-NEXT:    lui a0, 4080
 ; RV32-NEXT:    vand.vx v12, v12, a0
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a1
+; RV32-NEXT:    vor.vv v11, v11, v12
+; RV32-NEXT:    vor.vv v9, v11, v9
+; RV32-NEXT:    vsll.vx v11, v8, a1
 ; RV32-NEXT:    vand.vx v12, v8, a3
 ; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a0
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v11, v11, v12
+; RV32-NEXT:    vand.vv v10, v8, v10
+; RV32-NEXT:    vsll.vi v10, v10, 8
+; RV32-NEXT:    vand.vx v8, v8, a0
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    ret
 ;
@@ -591,30 +591,31 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v14, v14, a3, v0.t
 ; RV32-NEXT:    vor.vv v12, v14, v12, v0.t
-; RV32-NEXT:    vsrl.vi v14, v8, 24, v0.t
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v14, v14, a4, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT:    li a5, 85
+; RV32-NEXT:    vsrl.vi v14, v8, 8, v0.t
+; RV32-NEXT:    li a4, 85
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a5
-; RV32-NEXT:    vmv.v.i v18, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v18, v18, a5, v0
+; RV32-NEXT:    vmv.v.i v16, 0
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v10
-; RV32-NEXT:    vand.vv v16, v16, v18, v0.t
-; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vand.vv v14, v14, v16, v0.t
+; RV32-NEXT:    vsrl.vi v18, v8, 24, v0.t
+; RV32-NEXT:    lui a0, 4080
+; RV32-NEXT:    vand.vx v18, v18, a0, v0.t
+; RV32-NEXT:    vor.vv v14, v14, v18, v0.t
 ; RV32-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV32-NEXT:    vsll.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v16, v16, a2, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v18, v0.t
+; RV32-NEXT:    vand.vx v18, v8, a3, v0.t
+; RV32-NEXT:    vsll.vx v18, v18, a2, v0.t
+; RV32-NEXT:    vor.vv v14, v14, v18, v0.t
+; RV32-NEXT:    vand.vx v18, v8, a0, v0.t
+; RV32-NEXT:    vsll.vi v18, v18, 24, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v18, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v14, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    ret
@@ -667,30 +668,31 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v12, v12, a3
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 8
 ; RV32-NEXT:    li a4, 85
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    vmv.v.i v14, 0
+; RV32-NEXT:    vmv.v.i v12, 0
 ; RV32-NEXT:    lui a4, 1044480
-; RV32-NEXT:    vmerge.vxm v14, v14, a4, v0
+; RV32-NEXT:    vmerge.vxm v12, v12, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v14
+; RV32-NEXT:    vsrl.vi v14, v8, 8
+; RV32-NEXT:    vand.vv v14, v14, v12
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    lui a0, 4080
 ; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a1
+; RV32-NEXT:    vor.vv v14, v14, v16
+; RV32-NEXT:    vor.vv v10, v14, v10
+; RV32-NEXT:    vsll.vx v14, v8, a1
 ; RV32-NEXT:    vand.vx v16, v8, a3
 ; RV32-NEXT:    vsll.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a0
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v14, v14, v16
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vand.vx v8, v8, a0
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v14, v8
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
@@ -746,34 +748,35 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v20, v20, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v20, v16, v0.t
-; RV32-NEXT:    vsrl.vi v20, v8, 24, v0.t
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v24, v20, a4, v0.t
-; RV32-NEXT:    vsrl.vi v28, v8, 8, v0.t
-; RV32-NEXT:    lui a5, 5
-; RV32-NEXT:    addi a5, a5, 1365
+; RV32-NEXT:    vor.vv v20, v20, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT:    lui a4, 5
+; RV32-NEXT:    addi a4, a4, 1365
+; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a5
-; RV32-NEXT:    vmv.v.i v20, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v20, v20, a5, v0
+; RV32-NEXT:    vmv.v.i v16, 0
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v12
-; RV32-NEXT:    vand.vv v28, v28, v20, v0.t
-; RV32-NEXT:    vor.vv v24, v28, v24, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v28, v8, 24, v0.t
+; RV32-NEXT:    lui a0, 4080
+; RV32-NEXT:    vand.vx v28, v28, a0, v0.t
+; RV32-NEXT:    vor.vv v24, v24, v28, v0.t
+; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsll.vx v24, v8, a1, v0.t
 ; RV32-NEXT:    vand.vx v28, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vx v28, v28, a2, v0.t
 ; RV32-NEXT:    vor.vv v24, v24, v28, v0.t
-; RV32-NEXT:    vand.vx v28, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v28, v8, a0, v0.t
 ; RV32-NEXT:    vsll.vi v28, v28, 24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v20, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vor.vv v8, v28, v8, v0.t
 ; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_bswap_v8i64:
@@ -824,15 +827,16 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vand.vx v16, v16, a3
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsrl.vi v20, v8, 8
 ; RV32-NEXT:    lui a4, 5
 ; RV32-NEXT:    addi a4, a4, 1365
+; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
 ; RV32-NEXT:    vmv.v.i v16, 0
 ; RV32-NEXT:    lui a4, 1044480
 ; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v20, v8, 8
 ; RV32-NEXT:    vand.vv v20, v20, v16
 ; RV32-NEXT:    vsrl.vi v24, v8, 24
 ; RV32-NEXT:    lui a0, 4080
@@ -843,11 +847,11 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vand.vx v24, v8, a3
 ; RV32-NEXT:    vsll.vx v24, v24, a2
 ; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vand.vx v24, v8, a0
-; RV32-NEXT:    vsll.vi v24, v24, 24
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vand.vx v8, v8, a0
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v20, v8
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    ret
@@ -898,112 +902,127 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    vmv1r.v v1, v0
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v24, v8, a1, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 3
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a5, 349525
-; RV32-NEXT:    addi a5, a5, 1365
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    addi a6, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a6) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    li a6, 32
-; RV32-NEXT:    vmv.s.x v0, a5
-; RV32-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
-; RV32-NEXT:    lui a5, 1044480
+; RV32-NEXT:    li a5, 32
+; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT:    lui a6, 349525
+; RV32-NEXT:    addi a6, a6, 1365
 ; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    vmerge.vxm v16, v24, a5, v0
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 24
-; RV32-NEXT:    mul a5, a5, a6
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    lui a7, 1044480
+; RV32-NEXT:    vmv.v.x v0, a6
+; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT:    vmerge.vxm v24, v24, a7, v0
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v1
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a0, a0, a5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -1070,50 +1089,47 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    li a4, 32
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a2
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmerge.vxm v16, v16, a2, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 8
 ; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    lui a0, 4080
 ; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    lui a0, 4080
 ; RV32-NEXT:    vand.vx v0, v0, a0
 ; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    addi a3, a3, -256
+; RV32-NEXT:    vand.vx v0, v0, a3
+; RV32-NEXT:    vsrl.vx v24, v8, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vl8r.v v0, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a3
-; RV32-NEXT:    vsll.vx v0, v0, a2
-; RV32-NEXT:    vsll.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vand.vx v0, v8, a0
+; RV32-NEXT:    vsll.vi v0, v0, 24
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vsll.vx v0, v8, a1
+; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    vsll.vx v8, v8, a2
+; RV32-NEXT:    vor.vv v8, v0, v8
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
@@ -1166,112 +1182,127 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    vmv1r.v v1, v0
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v24, v8, a1, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 3
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a5, 349525
-; RV32-NEXT:    addi a5, a5, 1365
-; RV32-NEXT:    vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT:    addi a6, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a6) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    li a6, 32
-; RV32-NEXT:    vmv.s.x v0, a5
-; RV32-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
-; RV32-NEXT:    lui a5, 1044480
+; RV32-NEXT:    li a5, 32
+; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT:    lui a6, 349525
+; RV32-NEXT:    addi a6, a6, 1365
 ; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    vmerge.vxm v16, v24, a5, v0
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 24
-; RV32-NEXT:    mul a5, a5, a6
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    lui a7, 1044480
+; RV32-NEXT:    vmv.v.x v0, a6
+; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT:    vmerge.vxm v24, v24, a7, v0
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v1
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a0, a0, a5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT:    vand.vx v16, v24, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -1338,50 +1369,47 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    li a4, 32
-; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vmerge.vxm v16, v16, a4, v0
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a2
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmerge.vxm v16, v16, a2, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 8
 ; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    lui a0, 4080
 ; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    lui a0, 4080
 ; RV32-NEXT:    vand.vx v0, v0, a0
 ; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    addi a3, a3, -256
+; RV32-NEXT:    vand.vx v0, v0, a3
+; RV32-NEXT:    vsrl.vx v24, v8, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vl8r.v v0, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a3
-; RV32-NEXT:    vsll.vx v0, v0, a2
-; RV32-NEXT:    vsll.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vand.vx v0, v8, a0
+; RV32-NEXT:    vsll.vi v0, v0, 24
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vsll.vx v0, v8, a1
+; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    vsll.vx v8, v8, a2
+; RV32-NEXT:    vor.vv v8, v0, v8
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
index 03d8ca8e2832d..8f4ad2a7c86e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -71,39 +71,38 @@ define void @bswap_v2i64(ptr %x, ptr %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v10, v10, a4
-; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    vmv.s.x v0, a5
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v11, 0
-; RV32-NEXT:    lui a5, 1044480
-; RV32-NEXT:    vmerge.vxm v11, v11, a5, v0
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    vmerge.vxm v9, v9, a1, v0
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a4
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 8
+; RV32-NEXT:    vand.vv v10, v10, v9
+; RV32-NEXT:    vsrl.vi v11, v8, 24
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    vand.vx v11, v11, a1
+; RV32-NEXT:    vor.vv v10, v10, v11
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    vsrl.vx v12, v8, a3
+; RV32-NEXT:    lui a4, 16
+; RV32-NEXT:    addi a4, a4, -256
+; RV32-NEXT:    vand.vx v12, v12, a4
+; RV32-NEXT:    vor.vv v11, v12, v11
+; RV32-NEXT:    vor.vv v10, v10, v11
+; RV32-NEXT:    vand.vv v9, v8, v9
+; RV32-NEXT:    vsll.vi v9, v9, 8
+; RV32-NEXT:    vand.vx v11, v8, a1
+; RV32-NEXT:    vsll.vi v11, v11, 24
+; RV32-NEXT:    vor.vv v9, v11, v9
+; RV32-NEXT:    vsll.vx v11, v8, a2
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vx v8, v8, a3
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
@@ -318,39 +317,40 @@ define void @bswap_v4i64(ptr %x, ptr %y) {
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    li a1, 56
-; LMULMAX2-RV32-NEXT:    vsrl.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT:    li a2, 40
-; LMULMAX2-RV32-NEXT:    vsrl.vx v12, v8, a2
-; LMULMAX2-RV32-NEXT:    lui a3, 16
-; LMULMAX2-RV32-NEXT:    addi a3, a3, -256
-; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a3
-; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v12, v8, 24
-; LMULMAX2-RV32-NEXT:    lui a4, 4080
-; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a4
-; LMULMAX2-RV32-NEXT:    li a5, 85
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a5
+; LMULMAX2-RV32-NEXT:    li a1, 85
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.i v14, 0
-; LMULMAX2-RV32-NEXT:    lui a5, 1044480
-; LMULMAX2-RV32-NEXT:    vmerge.vxm v14, v14, a5, v0
+; LMULMAX2-RV32-NEXT:    vmv.v.i v10, 0
+; LMULMAX2-RV32-NEXT:    lui a1, 1044480
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vsrl.vi v16, v8, 8
-; LMULMAX2-RV32-NEXT:    vand.vv v16, v16, v14
-; LMULMAX2-RV32-NEXT:    vor.vv v12, v16, v12
-; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT:    vsll.vx v12, v8, a1
-; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a3
-; LMULMAX2-RV32-NEXT:    vsll.vx v16, v16, a2
-; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v16
-; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a4
-; LMULMAX2-RV32-NEXT:    vsll.vi v16, v16, 24
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v14
-; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 8
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v16, v8
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v12, v8
+; LMULMAX2-RV32-NEXT:    vsrl.vi v12, v8, 8
+; LMULMAX2-RV32-NEXT:    vand.vv v12, v12, v10
+; LMULMAX2-RV32-NEXT:    vsrl.vi v14, v8, 24
+; LMULMAX2-RV32-NEXT:    lui a1, 4080
+; LMULMAX2-RV32-NEXT:    vand.vx v14, v14, a1
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v14
+; LMULMAX2-RV32-NEXT:    li a2, 56
+; LMULMAX2-RV32-NEXT:    vsrl.vx v14, v8, a2
+; LMULMAX2-RV32-NEXT:    li a3, 40
+; LMULMAX2-RV32-NEXT:    vsrl.vx v16, v8, a3
+; LMULMAX2-RV32-NEXT:    lui a4, 16
+; LMULMAX2-RV32-NEXT:    addi a4, a4, -256
+; LMULMAX2-RV32-NEXT:    vand.vx v16, v16, a4
+; LMULMAX2-RV32-NEXT:    vor.vv v14, v16, v14
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v14
+; LMULMAX2-RV32-NEXT:    vand.vv v10, v8, v10
+; LMULMAX2-RV32-NEXT:    vsll.vi v10, v10, 8
+; LMULMAX2-RV32-NEXT:    vand.vx v14, v8, a1
+; LMULMAX2-RV32-NEXT:    vsll.vi v14, v14, 24
+; LMULMAX2-RV32-NEXT:    vor.vv v10, v14, v10
+; LMULMAX2-RV32-NEXT:    vsll.vx v14, v8, a2
+; LMULMAX2-RV32-NEXT:    vand.vx v8, v8, a4
+; LMULMAX2-RV32-NEXT:    vsll.vx v8, v8, a3
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v14, v8
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v12
 ; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -392,65 +392,64 @@ define void @bswap_v4i64(ptr %x, ptr %y) {
 ; LMULMAX1-RV32-LABEL: bswap_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    vle64.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    li a2, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v8, a2
-; LMULMAX1-RV32-NEXT:    li a3, 40
-; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v8, a3
-; LMULMAX1-RV32-NEXT:    lui a4, 16
-; LMULMAX1-RV32-NEXT:    addi a4, a4, -256
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 24
-; LMULMAX1-RV32-NEXT:    lui a5, 4080
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a5
-; LMULMAX1-RV32-NEXT:    li a6, 5
-; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a6
+; LMULMAX1-RV32-NEXT:    vle64.v v9, (a1)
+; LMULMAX1-RV32-NEXT:    vmv.v.i v0, 5
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.i v12, 0
-; LMULMAX1-RV32-NEXT:    lui a6, 1044480
-; LMULMAX1-RV32-NEXT:    vmerge.vxm v12, v12, a6, v0
+; LMULMAX1-RV32-NEXT:    vmv.v.i v10, 0
+; LMULMAX1-RV32-NEXT:    lui a2, 1044480
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v10, v10, a2, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v8, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v12
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsll.vx v11, v8, a2
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v8, a4
-; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v13
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v8, a5
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v9, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v12, v9, 24
+; LMULMAX1-RV32-NEXT:    lui a2, 4080
+; LMULMAX1-RV32-NEXT:    vand.vx v12, v12, a2
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v12
+; LMULMAX1-RV32-NEXT:    li a3, 56
+; LMULMAX1-RV32-NEXT:    vsrl.vx v12, v9, a3
+; LMULMAX1-RV32-NEXT:    li a4, 40
+; LMULMAX1-RV32-NEXT:    vsrl.vx v13, v9, a4
+; LMULMAX1-RV32-NEXT:    lui a5, 16
+; LMULMAX1-RV32-NEXT:    addi a5, a5, -256
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v13, a5
+; LMULMAX1-RV32-NEXT:    vor.vv v12, v13, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v12
+; LMULMAX1-RV32-NEXT:    vand.vv v12, v9, v10
+; LMULMAX1-RV32-NEXT:    vsll.vi v12, v12, 8
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v9, a2
 ; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 24
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v12
-; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v13, v8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v11, v8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v9, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v9, a3
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v9, 24
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v9, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v12
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsll.vx v11, v9, a2
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v9, a4
-; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v13
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v9, a5
-; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 24
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v12
-; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 8
+; LMULMAX1-RV32-NEXT:    vor.vv v12, v13, v12
+; LMULMAX1-RV32-NEXT:    vsll.vx v13, v9, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a5
+; LMULMAX1-RV32-NEXT:    vsll.vx v9, v9, a4
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v13, v9
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v11, v9
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v8, (a1)
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v12, v8, 24
+; LMULMAX1-RV32-NEXT:    vand.vx v12, v12, a2
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v12
+; LMULMAX1-RV32-NEXT:    vsrl.vx v12, v8, a3
+; LMULMAX1-RV32-NEXT:    vsrl.vx v13, v8, a4
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v13, a5
+; LMULMAX1-RV32-NEXT:    vor.vv v12, v13, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v12
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v8, v10
+; LMULMAX1-RV32-NEXT:    vsll.vi v10, v10, 8
+; LMULMAX1-RV32-NEXT:    vand.vx v12, v8, a2
+; LMULMAX1-RV32-NEXT:    vsll.vi v12, v12, 24
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v12, v10
+; LMULMAX1-RV32-NEXT:    vsll.vx v12, v8, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a5
+; LMULMAX1-RV32-NEXT:    vsll.vx v8, v8, a4
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v12, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
+; LMULMAX1-RV32-NEXT:    vse64.v v9, (a1)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bswap_v4i64:

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 13ac5fd3d4860..740ad95532316 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -35,9 +35,8 @@ define void @buildvec_no_vid_v4f32(<4 x float>* %x) {
 define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, <8 x float> %y) optsize {
 ; LMULMAX1-LABEL: hang_when_merging_stores_after_legalization:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    li a0, 2
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX1-NEXT:    vmv.s.x v0, a0
+; LMULMAX1-NEXT:    vmv.v.i v0, 2
 ; LMULMAX1-NEXT:    vrgather.vi v12, v8, 0
 ; LMULMAX1-NEXT:    vrgather.vi v12, v9, 3, v0.t
 ; LMULMAX1-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
@@ -49,14 +48,15 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
 ;
 ; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vid.v v12
 ; LMULMAX2-NEXT:    li a0, 7
 ; LMULMAX2-NEXT:    vmul.vx v14, v12, a0
 ; LMULMAX2-NEXT:    vrgather.vv v12, v8, v14
-; LMULMAX2-NEXT:    li a0, 12
-; LMULMAX2-NEXT:    vmv.s.x v0, a0
 ; LMULMAX2-NEXT:    vadd.vi v8, v14, -14
+; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-NEXT:    vmv.v.i v0, 12
+; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; LMULMAX2-NEXT:    vrgather.vv v12, v10, v8, v0.t
 ; LMULMAX2-NEXT:    vmv1r.v v8, v12
 ; LMULMAX2-NEXT:    ret
@@ -150,10 +150,9 @@ define void @buildvec_dominant2_v4f32(<4 x float>* %x, float %f) {
 define void @buildvec_merge0_v4f32(<4 x float>* %x, float %f) {
 ; CHECK-LABEL: buildvec_merge0_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 6
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
+; CHECK-NEXT:    vmv.v.i v0, 6
 ; CHECK-NEXT:    lui a1, 262144
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-NEXT:    vse32.v v8, (a0)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 4b9cad69082b9..db7305ce1f784 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -42,10 +42,11 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; RV32-V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV32-V128-NEXT:    vid.v v9
 ; RV32-V128-NEXT:    vsrl.vi v14, v9, 1
-; RV32-V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; RV32-V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV32-V128-NEXT:    vrgatherei16.vv v10, v8, v14
-; RV32-V128-NEXT:    li a0, 10
-; RV32-V128-NEXT:    vmv.s.x v0, a0
+; RV32-V128-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-V128-NEXT:    vmv.v.i v0, 10
+; RV32-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; RV32-V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
 ; RV32-V128-NEXT:    vmv.v.v v8, v10
 ; RV32-V128-NEXT:    ret
@@ -53,12 +54,13 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; RV64-V128-LABEL: interleave_v2f64:
 ; RV64-V128:       # %bb.0:
 ; RV64-V128-NEXT:    vmv1r.v v12, v9
-; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-V128-NEXT:    vid.v v10
 ; RV64-V128-NEXT:    vsrl.vi v14, v10, 1
 ; RV64-V128-NEXT:    vrgather.vv v10, v8, v14
-; RV64-V128-NEXT:    li a0, 10
-; RV64-V128-NEXT:    vmv.s.x v0, a0
+; RV64-V128-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-V128-NEXT:    vmv.v.i v0, 10
+; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; RV64-V128-NEXT:    vrgather.vv v10, v12, v14, v0.t
 ; RV64-V128-NEXT:    vmv.v.v v8, v10
 ; RV64-V128-NEXT:    ret
@@ -69,8 +71,7 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; RV32-V512-NEXT:    vid.v v10
 ; RV32-V512-NEXT:    vsrl.vi v11, v10, 1
 ; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
-; RV32-V512-NEXT:    li a0, 10
-; RV32-V512-NEXT:    vmv.s.x v0, a0
+; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v8, v11
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v9, v11, v0.t
 ; RV32-V512-NEXT:    vmv.v.v v8, v10
@@ -81,8 +82,7 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
 ; RV64-V512-NEXT:    vid.v v10
 ; RV64-V512-NEXT:    vsrl.vi v11, v10, 1
-; RV64-V512-NEXT:    li a0, 10
-; RV64-V512-NEXT:    vmv.s.x v0, a0
+; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v8, v11
 ; RV64-V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV64-V512-NEXT:    vmv.v.v v8, v10
@@ -255,56 +255,48 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; RV32-V128-NEXT:    addi sp, sp, -16
 ; RV32-V128-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    li a1, 24
-; RV32-V128-NEXT:    mul a0, a0, a1
+; RV32-V128-NEXT:    slli a0, a0, 4
 ; RV32-V128-NEXT:    sub sp, sp, a0
-; RV32-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    slli a0, a0, 3
-; RV32-V128-NEXT:    add a0, sp, a0
-; RV32-V128-NEXT:    addi a0, a0, 16
-; RV32-V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-V128-NEXT:    addi a0, sp, 16
-; RV32-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-V128-NEXT:    lui a0, %hi(.LCPI10_0)
 ; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI10_0)
 ; RV32-V128-NEXT:    li a1, 32
-; RV32-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; RV32-V128-NEXT:    vle32.v v24, (a0)
+; RV32-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-V128-NEXT:    vle32.v v0, (a0)
+; RV32-V128-NEXT:    vmv8r.v v24, v8
+; RV32-V128-NEXT:    addi a0, sp, 16
+; RV32-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV32-V128-NEXT:    lui a0, %hi(.LCPI10_1)
 ; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; RV32-V128-NEXT:    vle32.v v16, (a0)
+; RV32-V128-NEXT:    vle32.v v24, (a0)
 ; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    slli a0, a0, 4
+; RV32-V128-NEXT:    slli a0, a0, 3
 ; RV32-V128-NEXT:    add a0, sp, a0
 ; RV32-V128-NEXT:    addi a0, a0, 16
-; RV32-V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; RV32-V128-NEXT:    lui a0, 699051
 ; RV32-V128-NEXT:    addi a0, a0, -1366
-; RV32-V128-NEXT:    vmv.s.x v0, a0
-; RV32-V128-NEXT:    vrgather.vv v16, v8, v24
-; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    slli a0, a0, 4
-; RV32-V128-NEXT:    add a0, sp, a0
-; RV32-V128-NEXT:    addi a0, a0, 16
-; RV32-V128-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-V128-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-V128-NEXT:    vmv.v.x v0, a0
+; RV32-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; RV32-V128-NEXT:    csrr a0, vlenb
 ; RV32-V128-NEXT:    slli a0, a0, 3
 ; RV32-V128-NEXT:    add a0, sp, a0
 ; RV32-V128-NEXT:    addi a0, a0, 16
-; RV32-V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT:    vrgather.vv v16, v8, v24, v0.t
+; RV32-V128-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-V128-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; RV32-V128-NEXT:    vmv.v.v v24, v8
 ; RV32-V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-V128-NEXT:    vmv4r.v v24, v8
 ; RV32-V128-NEXT:    addi a0, sp, 16
 ; RV32-V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT:    vwaddu.vv v0, v8, v24
+; RV32-V128-NEXT:    vwaddu.vv v0, v8, v16
 ; RV32-V128-NEXT:    li a0, -1
-; RV32-V128-NEXT:    vwmaccu.vx v0, a0, v24
+; RV32-V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; RV32-V128-NEXT:    vmv8r.v v8, v0
+; RV32-V128-NEXT:    vmv8r.v v16, v24
 ; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    li a1, 24
-; RV32-V128-NEXT:    mul a0, a0, a1
+; RV32-V128-NEXT:    slli a0, a0, 4
 ; RV32-V128-NEXT:    add sp, sp, a0
 ; RV32-V128-NEXT:    addi sp, sp, 16
 ; RV32-V128-NEXT:    ret
@@ -314,56 +306,48 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; RV64-V128-NEXT:    addi sp, sp, -16
 ; RV64-V128-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    li a1, 24
-; RV64-V128-NEXT:    mul a0, a0, a1
+; RV64-V128-NEXT:    slli a0, a0, 4
 ; RV64-V128-NEXT:    sub sp, sp, a0
-; RV64-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    slli a0, a0, 3
-; RV64-V128-NEXT:    add a0, sp, a0
-; RV64-V128-NEXT:    addi a0, a0, 16
-; RV64-V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV64-V128-NEXT:    addi a0, sp, 16
-; RV64-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-V128-NEXT:    lui a0, %hi(.LCPI10_0)
 ; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI10_0)
 ; RV64-V128-NEXT:    li a1, 32
-; RV64-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; RV64-V128-NEXT:    vle32.v v24, (a0)
+; RV64-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-V128-NEXT:    vle32.v v0, (a0)
+; RV64-V128-NEXT:    vmv8r.v v24, v8
+; RV64-V128-NEXT:    addi a0, sp, 16
+; RV64-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV64-V128-NEXT:    lui a0, %hi(.LCPI10_1)
 ; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; RV64-V128-NEXT:    vle32.v v16, (a0)
+; RV64-V128-NEXT:    vle32.v v24, (a0)
 ; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    slli a0, a0, 4
+; RV64-V128-NEXT:    slli a0, a0, 3
 ; RV64-V128-NEXT:    add a0, sp, a0
 ; RV64-V128-NEXT:    addi a0, a0, 16
-; RV64-V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; RV64-V128-NEXT:    lui a0, 699051
 ; RV64-V128-NEXT:    addiw a0, a0, -1366
-; RV64-V128-NEXT:    vmv.s.x v0, a0
-; RV64-V128-NEXT:    vrgather.vv v16, v8, v24
-; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    slli a0, a0, 4
-; RV64-V128-NEXT:    add a0, sp, a0
-; RV64-V128-NEXT:    addi a0, a0, 16
-; RV64-V128-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-V128-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-V128-NEXT:    vmv.v.x v0, a0
+; RV64-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; RV64-V128-NEXT:    csrr a0, vlenb
 ; RV64-V128-NEXT:    slli a0, a0, 3
 ; RV64-V128-NEXT:    add a0, sp, a0
 ; RV64-V128-NEXT:    addi a0, a0, 16
-; RV64-V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT:    vrgather.vv v16, v8, v24, v0.t
+; RV64-V128-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-V128-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; RV64-V128-NEXT:    vmv.v.v v24, v8
 ; RV64-V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-V128-NEXT:    vmv4r.v v24, v8
 ; RV64-V128-NEXT:    addi a0, sp, 16
 ; RV64-V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT:    vwaddu.vv v0, v8, v24
+; RV64-V128-NEXT:    vwaddu.vv v0, v8, v16
 ; RV64-V128-NEXT:    li a0, -1
-; RV64-V128-NEXT:    vwmaccu.vx v0, a0, v24
+; RV64-V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; RV64-V128-NEXT:    vmv8r.v v8, v0
+; RV64-V128-NEXT:    vmv8r.v v16, v24
 ; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    li a1, 24
-; RV64-V128-NEXT:    mul a0, a0, a1
+; RV64-V128-NEXT:    slli a0, a0, 4
 ; RV64-V128-NEXT:    add sp, sp, a0
 ; RV64-V128-NEXT:    addi sp, sp, 16
 ; RV64-V128-NEXT:    ret

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 944dea21335e9..9dced0e5204f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -5,9 +5,9 @@
 define <4 x half> @shuffle_v4f16(<4 x half> %x, <4 x half> %y) {
 ; CHECK-LABEL: shuffle_v4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 11
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 11
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x half> %x, <4 x half> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
@@ -18,8 +18,9 @@ define <8 x float> @shuffle_v8f32(<8 x float> %x, <8 x float> %y) {
 ; CHECK-LABEL: shuffle_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 236
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    ret
   %s = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
@@ -27,49 +28,29 @@ define <8 x float> @shuffle_v8f32(<8 x float> %x, <8 x float> %y) {
 }
 
 define <4 x double> @shuffle_fv_v4f64(<4 x double> %x) {
-; RV32-LABEL: shuffle_fv_v4f64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a0, 9
-; RV32-NEXT:    lui a1, %hi(.LCPI2_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI2_0)(a1)
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vfmerge.vfm v8, v8, fa5, v0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: shuffle_fv_v4f64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI2_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI2_0)(a0)
-; RV64-NEXT:    li a0, 9
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vfmerge.vfm v8, v8, fa5, v0
-; RV64-NEXT:    ret
+; CHECK-LABEL: shuffle_fv_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 9
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vfmerge.vfm v8, v8, fa5, v0
+; CHECK-NEXT:    ret
   %s = shufflevector <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x double> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
   ret <4 x double> %s
 }
 
 define <4 x double> @shuffle_vf_v4f64(<4 x double> %x) {
-; RV32-LABEL: shuffle_vf_v4f64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a0, 6
-; RV32-NEXT:    lui a1, %hi(.LCPI3_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI3_0)(a1)
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vfmerge.vfm v8, v8, fa5, v0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: shuffle_vf_v4f64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI3_0)(a0)
-; RV64-NEXT:    li a0, 6
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vfmerge.vfm v8, v8, fa5, v0
-; RV64-NEXT:    ret
+; CHECK-LABEL: shuffle_vf_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 6
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vfmerge.vfm v8, v8, fa5, v0
+; CHECK-NEXT:    ret
   %s = shufflevector <4 x double> %x, <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
   ret <4 x double> %s
 }
@@ -127,11 +108,12 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lui a0, %hi(.LCPI6_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI6_0)
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle16.v v14, (a0)
-; RV32-NEXT:    li a0, 8
-; RV32-NEXT:    vmv.s.x v0, a0
 ; RV32-NEXT:    vrgatherei16.vv v12, v8, v14
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.i v0, 8
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; RV32-NEXT:    vrgather.vi v12, v10, 1, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    ret
@@ -140,11 +122,12 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a0, %hi(.LCPI6_0)
 ; RV64-NEXT:    addi a0, a0, %lo(.LCPI6_0)
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vle64.v v14, (a0)
-; RV64-NEXT:    li a0, 8
-; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    vrgather.vv v12, v8, v14
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 8
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; RV64-NEXT:    vrgather.vi v12, v10, 1, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v12
 ; RV64-NEXT:    ret
@@ -155,14 +138,13 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
 ; RV32-LABEL: vrgather_shuffle_xv_v4f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a0, 12
 ; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vid.v v12
 ; RV32-NEXT:    lui a0, %hi(.LCPI7_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI7_0)
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vid.v v12
 ; RV32-NEXT:    vrsub.vi v12, v12, 4
+; RV32-NEXT:    vmv.v.i v0, 12
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
@@ -170,14 +152,15 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
 ;
 ; RV64-LABEL: vrgather_shuffle_xv_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vid.v v10
+; RV64-NEXT:    vrsub.vi v12, v10, 4
 ; RV64-NEXT:    lui a0, %hi(.LCPI7_0)
 ; RV64-NEXT:    addi a0, a0, %lo(.LCPI7_0)
 ; RV64-NEXT:    vlse64.v v10, (a0), zero
-; RV64-NEXT:    li a0, 12
-; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vid.v v12
-; RV64-NEXT:    vrsub.vi v12, v12, 4
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 12
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; RV64-NEXT:    vrgather.vv v10, v8, v12, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v10
 ; RV64-NEXT:    ret
@@ -195,7 +178,7 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_0)
 ; RV32-NEXT:    vlse64.v v10, (a1), zero
 ; RV32-NEXT:    vmul.vx v12, v12, a0
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmv.v.i v0, 3
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
@@ -203,14 +186,16 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
 ;
 ; RV64-LABEL: vrgather_shuffle_vx_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; RV64-NEXT:    vid.v v12
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vid.v v10
+; RV64-NEXT:    li a0, 3
+; RV64-NEXT:    vmul.vx v12, v10, a0
 ; RV64-NEXT:    lui a0, %hi(.LCPI8_0)
 ; RV64-NEXT:    addi a0, a0, %lo(.LCPI8_0)
 ; RV64-NEXT:    vlse64.v v10, (a0), zero
-; RV64-NEXT:    li a0, 3
-; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vmul.vx v12, v12, a0
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 3
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; RV64-NEXT:    vrgather.vv v10, v8, v12, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v10
 ; RV64-NEXT:    ret

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index a536c121898de..4e756448663ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -450,13 +450,13 @@ define void @buildvec_seq_v9i8(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 73
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv.v.x v0, a1
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 3
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    li a1, 146
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv.v.x v0, a1
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vim v8, v8, 2, v0
 ; CHECK-NEXT:    vsetivli zero, 9, e8, m1, ta, ma

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 7d7aa9c3b5493..defd3409c3e66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -55,10 +55,11 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; RV32-V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV32-V128-NEXT:    vid.v v9
 ; RV32-V128-NEXT:    vsrl.vi v14, v9, 1
-; RV32-V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; RV32-V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV32-V128-NEXT:    vrgatherei16.vv v10, v8, v14
-; RV32-V128-NEXT:    li a0, 10
-; RV32-V128-NEXT:    vmv.s.x v0, a0
+; RV32-V128-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-V128-NEXT:    vmv.v.i v0, 10
+; RV32-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; RV32-V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
 ; RV32-V128-NEXT:    vmv.v.v v8, v10
 ; RV32-V128-NEXT:    ret
@@ -66,12 +67,13 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; RV64-V128-LABEL: interleave_v2i64:
 ; RV64-V128:       # %bb.0:
 ; RV64-V128-NEXT:    vmv1r.v v12, v9
-; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-V128-NEXT:    vid.v v10
 ; RV64-V128-NEXT:    vsrl.vi v14, v10, 1
 ; RV64-V128-NEXT:    vrgather.vv v10, v8, v14
-; RV64-V128-NEXT:    li a0, 10
-; RV64-V128-NEXT:    vmv.s.x v0, a0
+; RV64-V128-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-V128-NEXT:    vmv.v.i v0, 10
+; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; RV64-V128-NEXT:    vrgather.vv v10, v12, v14, v0.t
 ; RV64-V128-NEXT:    vmv.v.v v8, v10
 ; RV64-V128-NEXT:    ret
@@ -82,8 +84,7 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; RV32-V512-NEXT:    vid.v v10
 ; RV32-V512-NEXT:    vsrl.vi v11, v10, 1
 ; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
-; RV32-V512-NEXT:    li a0, 10
-; RV32-V512-NEXT:    vmv.s.x v0, a0
+; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v8, v11
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v9, v11, v0.t
 ; RV32-V512-NEXT:    vmv.v.v v8, v10
@@ -94,8 +95,7 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
 ; RV64-V512-NEXT:    vid.v v10
 ; RV64-V512-NEXT:    vsrl.vi v11, v10, 1
-; RV64-V512-NEXT:    li a0, 10
-; RV64-V512-NEXT:    vmv.s.x v0, a0
+; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v8, v11
 ; RV64-V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV64-V512-NEXT:    vmv.v.v v8, v10
@@ -206,8 +206,7 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V128-NEXT:    vid.v v10
 ; V128-NEXT:    vsrl.vi v11, v10, 1
 ; V128-NEXT:    vrgather.vv v10, v8, v11
-; V128-NEXT:    li a0, 10
-; V128-NEXT:    vmv.s.x v0, a0
+; V128-NEXT:    vmv.v.i v0, 10
 ; V128-NEXT:    vadd.vi v8, v11, 1
 ; V128-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; V128-NEXT:    vmv.v.v v8, v10
@@ -219,8 +218,7 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V512-NEXT:    vid.v v10
 ; V512-NEXT:    vsrl.vi v11, v10, 1
 ; V512-NEXT:    vrgather.vv v10, v8, v11
-; V512-NEXT:    li a0, 10
-; V512-NEXT:    vmv.s.x v0, a0
+; V512-NEXT:    vmv.v.i v0, 10
 ; V512-NEXT:    vadd.vi v8, v11, 1
 ; V512-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; V512-NEXT:    vmv1r.v v8, v10
@@ -416,56 +414,48 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; RV32-V128-NEXT:    addi sp, sp, -16
 ; RV32-V128-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    li a1, 24
-; RV32-V128-NEXT:    mul a0, a0, a1
+; RV32-V128-NEXT:    slli a0, a0, 4
 ; RV32-V128-NEXT:    sub sp, sp, a0
-; RV32-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    slli a0, a0, 3
-; RV32-V128-NEXT:    add a0, sp, a0
-; RV32-V128-NEXT:    addi a0, a0, 16
-; RV32-V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-V128-NEXT:    addi a0, sp, 16
-; RV32-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-V128-NEXT:    lui a0, %hi(.LCPI17_0)
 ; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI17_0)
 ; RV32-V128-NEXT:    li a1, 32
-; RV32-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; RV32-V128-NEXT:    vle32.v v24, (a0)
+; RV32-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-V128-NEXT:    vle32.v v0, (a0)
+; RV32-V128-NEXT:    vmv8r.v v24, v8
+; RV32-V128-NEXT:    addi a0, sp, 16
+; RV32-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV32-V128-NEXT:    lui a0, %hi(.LCPI17_1)
 ; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI17_1)
-; RV32-V128-NEXT:    vle32.v v16, (a0)
+; RV32-V128-NEXT:    vle32.v v24, (a0)
 ; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    slli a0, a0, 4
+; RV32-V128-NEXT:    slli a0, a0, 3
 ; RV32-V128-NEXT:    add a0, sp, a0
 ; RV32-V128-NEXT:    addi a0, a0, 16
-; RV32-V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; RV32-V128-NEXT:    lui a0, 699051
 ; RV32-V128-NEXT:    addi a0, a0, -1366
-; RV32-V128-NEXT:    vmv.s.x v0, a0
-; RV32-V128-NEXT:    vrgather.vv v16, v8, v24
-; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    slli a0, a0, 4
-; RV32-V128-NEXT:    add a0, sp, a0
-; RV32-V128-NEXT:    addi a0, a0, 16
-; RV32-V128-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-V128-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-V128-NEXT:    vmv.v.x v0, a0
+; RV32-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; RV32-V128-NEXT:    csrr a0, vlenb
 ; RV32-V128-NEXT:    slli a0, a0, 3
 ; RV32-V128-NEXT:    add a0, sp, a0
 ; RV32-V128-NEXT:    addi a0, a0, 16
-; RV32-V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT:    vrgather.vv v16, v8, v24, v0.t
+; RV32-V128-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-V128-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; RV32-V128-NEXT:    vmv.v.v v24, v8
 ; RV32-V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-V128-NEXT:    vmv4r.v v24, v8
 ; RV32-V128-NEXT:    addi a0, sp, 16
 ; RV32-V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT:    vwaddu.vv v0, v8, v24
+; RV32-V128-NEXT:    vwaddu.vv v0, v8, v16
 ; RV32-V128-NEXT:    li a0, -1
-; RV32-V128-NEXT:    vwmaccu.vx v0, a0, v24
+; RV32-V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; RV32-V128-NEXT:    vmv8r.v v8, v0
+; RV32-V128-NEXT:    vmv8r.v v16, v24
 ; RV32-V128-NEXT:    csrr a0, vlenb
-; RV32-V128-NEXT:    li a1, 24
-; RV32-V128-NEXT:    mul a0, a0, a1
+; RV32-V128-NEXT:    slli a0, a0, 4
 ; RV32-V128-NEXT:    add sp, sp, a0
 ; RV32-V128-NEXT:    addi sp, sp, 16
 ; RV32-V128-NEXT:    ret
@@ -475,56 +465,48 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; RV64-V128-NEXT:    addi sp, sp, -16
 ; RV64-V128-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    li a1, 24
-; RV64-V128-NEXT:    mul a0, a0, a1
+; RV64-V128-NEXT:    slli a0, a0, 4
 ; RV64-V128-NEXT:    sub sp, sp, a0
-; RV64-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    slli a0, a0, 3
-; RV64-V128-NEXT:    add a0, sp, a0
-; RV64-V128-NEXT:    addi a0, a0, 16
-; RV64-V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV64-V128-NEXT:    addi a0, sp, 16
-; RV64-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-V128-NEXT:    lui a0, %hi(.LCPI17_0)
 ; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI17_0)
 ; RV64-V128-NEXT:    li a1, 32
-; RV64-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; RV64-V128-NEXT:    vle32.v v24, (a0)
+; RV64-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-V128-NEXT:    vle32.v v0, (a0)
+; RV64-V128-NEXT:    vmv8r.v v24, v8
+; RV64-V128-NEXT:    addi a0, sp, 16
+; RV64-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV64-V128-NEXT:    lui a0, %hi(.LCPI17_1)
 ; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI17_1)
-; RV64-V128-NEXT:    vle32.v v16, (a0)
+; RV64-V128-NEXT:    vle32.v v24, (a0)
 ; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    slli a0, a0, 4
+; RV64-V128-NEXT:    slli a0, a0, 3
 ; RV64-V128-NEXT:    add a0, sp, a0
 ; RV64-V128-NEXT:    addi a0, a0, 16
-; RV64-V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; RV64-V128-NEXT:    lui a0, 699051
 ; RV64-V128-NEXT:    addiw a0, a0, -1366
-; RV64-V128-NEXT:    vmv.s.x v0, a0
-; RV64-V128-NEXT:    vrgather.vv v16, v8, v24
-; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    slli a0, a0, 4
-; RV64-V128-NEXT:    add a0, sp, a0
-; RV64-V128-NEXT:    addi a0, a0, 16
-; RV64-V128-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-V128-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-V128-NEXT:    vmv.v.x v0, a0
+; RV64-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; RV64-V128-NEXT:    csrr a0, vlenb
 ; RV64-V128-NEXT:    slli a0, a0, 3
 ; RV64-V128-NEXT:    add a0, sp, a0
 ; RV64-V128-NEXT:    addi a0, a0, 16
-; RV64-V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT:    vrgather.vv v16, v8, v24, v0.t
+; RV64-V128-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-V128-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; RV64-V128-NEXT:    vmv.v.v v24, v8
 ; RV64-V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-V128-NEXT:    vmv4r.v v24, v8
 ; RV64-V128-NEXT:    addi a0, sp, 16
 ; RV64-V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT:    vwaddu.vv v0, v8, v24
+; RV64-V128-NEXT:    vwaddu.vv v0, v8, v16
 ; RV64-V128-NEXT:    li a0, -1
-; RV64-V128-NEXT:    vwmaccu.vx v0, a0, v24
+; RV64-V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; RV64-V128-NEXT:    vmv8r.v v8, v0
+; RV64-V128-NEXT:    vmv8r.v v16, v24
 ; RV64-V128-NEXT:    csrr a0, vlenb
-; RV64-V128-NEXT:    li a1, 24
-; RV64-V128-NEXT:    mul a0, a0, a1
+; RV64-V128-NEXT:    slli a0, a0, 4
 ; RV64-V128-NEXT:    add sp, sp, a0
 ; RV64-V128-NEXT:    addi sp, sp, 16
 ; RV64-V128-NEXT:    ret

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index e9fce03485aac..dd3f39939cae8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -5,9 +5,9 @@
 define <4 x i16> @shuffle_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: shuffle_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 11
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 11
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> %x, <4 x i16> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
@@ -18,8 +18,9 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: shuffle_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 203
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    ret
   %s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
@@ -29,9 +30,9 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %x, <8 x i32> %y) {
 define <4 x i16> @shuffle_xv_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: shuffle_xv_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 9
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 9
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 5, v0
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i16> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
@@ -41,9 +42,9 @@ define <4 x i16> @shuffle_xv_v4i16(<4 x i16> %x) {
 define <4 x i16> @shuffle_vx_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: shuffle_vx_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 6
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 6
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 5, v0
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> %x, <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
@@ -85,8 +86,7 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI6_0)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v11, (a0)
-; CHECK-NEXT:    li a0, 8
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -98,11 +98,10 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
 define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: vrgather_shuffle_xv_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 12
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vrsub.vi v10, v9, 4
+; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vmv.v.i v9, 5
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -118,7 +117,7 @@ define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) {
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 3
 ; CHECK-NEXT:    vmul.vx v10, v9, a0
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 3
 ; CHECK-NEXT:    vmv.v.i v9, 5
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -180,32 +179,36 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v16, 5
-; RV32-NEXT:    vmv.v.i v20, 2
-; RV32-NEXT:    vslideup.vi v20, v16, 7
 ; RV32-NEXT:    lui a0, %hi(.LCPI11_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI11_0)
-; RV32-NEXT:    vle16.v v21, (a0)
-; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; RV32-NEXT:    vle16.v v20, (a0)
+; RV32-NEXT:    vmv.v.i v21, 2
+; RV32-NEXT:    vslideup.vi v21, v16, 7
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v20
 ; RV32-NEXT:    li a0, 164
-; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v21
-; RV32-NEXT:    vrgatherei16.vv v16, v12, v20, v0.t
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV32-NEXT:    vrgatherei16.vv v16, v12, v21, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 5
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vmv.s.x v16, a0
 ; RV64-NEXT:    vmv.v.i v20, 2
 ; RV64-NEXT:    lui a0, %hi(.LCPI11_0)
 ; RV64-NEXT:    addi a0, a0, %lo(.LCPI11_0)
 ; RV64-NEXT:    vle64.v v24, (a0)
 ; RV64-NEXT:    vslideup.vi v20, v16, 7
-; RV64-NEXT:    li a0, 164
-; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    vrgather.vv v16, v8, v24
+; RV64-NEXT:    li a0, 164
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a0
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgather.vv v16, v12, v20, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v16
 ; RV64-NEXT:    ret
@@ -218,27 +221,31 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI12_0)
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle16.v v16, (a0)
 ; RV32-NEXT:    vmv.v.i v20, -1
+; RV32-NEXT:    vrgatherei16.vv v12, v20, v16
 ; RV32-NEXT:    lui a0, %hi(.LCPI12_1)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI12_1)
-; RV32-NEXT:    vle16.v v17, (a0)
+; RV32-NEXT:    vle16.v v16, (a0)
 ; RV32-NEXT:    li a0, 113
-; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vrgatherei16.vv v12, v20, v16
-; RV32-NEXT:    vrgatherei16.vv v12, v8, v17, v0.t
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV32-NEXT:    vrgatherei16.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_shuffle_xv_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV64-NEXT:    addi a0, a0, %lo(.LCPI12_0)
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vle64.v v16, (a0)
 ; RV64-NEXT:    li a0, 113
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a0
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vmv.v.i v12, -1
 ; RV64-NEXT:    vrgather.vv v12, v8, v16, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v12
@@ -252,14 +259,16 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lui a0, %hi(.LCPI13_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI13_0)
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle16.v v16, (a0)
 ; RV32-NEXT:    vrgatherei16.vv v12, v8, v16
 ; RV32-NEXT:    lui a0, %hi(.LCPI13_1)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI13_1)
 ; RV32-NEXT:    vle16.v v8, (a0)
 ; RV32-NEXT:    li a0, 140
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV32-NEXT:    vmv.v.i v16, 5
 ; RV32-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
@@ -267,12 +276,14 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) {
 ;
 ; RV64-LABEL: vrgather_shuffle_vx_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    lui a0, %hi(.LCPI13_0)
 ; RV64-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vle64.v v16, (a0)
 ; RV64-NEXT:    li a0, 115
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a0
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vmv.v.i v12, 5
 ; RV64-NEXT:    vrgather.vv v12, v8, v16, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v12
@@ -377,9 +388,9 @@ define <8 x i8> @splat_ve4_ins_i1ve3(<8 x i8> %v) {
 define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 66
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    li a0, 66
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vrgather.vi v10, v8, 2
 ; CHECK-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -398,7 +409,7 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vmv.s.x v11, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT:    li a0, 66
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -410,12 +421,12 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @splat_ve2_we0_ins_i0we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0_ins_i0we4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 67
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrgather.vi v10, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 4
+; CHECK-NEXT:    li a0, 67
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -433,7 +444,7 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
 ; RV32-NEXT:    vmv.v.x v11, a0
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV32-NEXT:    li a0, 66
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmv.v.x v0, a0
 ; RV32-NEXT:    vrgather.vv v10, v8, v11
 ; RV32-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v10
@@ -447,7 +458,7 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
 ; RV64-NEXT:    vmv.v.x v11, a0
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64-NEXT:    li a0, 66
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vmv.v.x v0, a0
 ; RV64-NEXT:    vrgather.vv v10, v8, v11
 ; RV64-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; RV64-NEXT:    vmv1r.v v8, v10
@@ -464,9 +475,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vsetivli zero, 3, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v11, v10, 2
-; CHECK-NEXT:    li a0, 70
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    li a0, 70
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vrgather.vi v10, v8, 2
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -489,7 +500,7 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
 ; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV32-NEXT:    li a0, 98
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmv.v.x v0, a0
 ; RV32-NEXT:    vrgather.vv v10, v8, v12
 ; RV32-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v10
@@ -508,7 +519,7 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
 ; RV64-NEXT:    vmv.v.x v12, a0
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64-NEXT:    li a0, 98
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vmv.v.x v0, a0
 ; RV64-NEXT:    vrgather.vv v10, v8, v12
 ; RV64-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV64-NEXT:    vmv1r.v v8, v10
@@ -661,7 +672,7 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    li a0, 224
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vadd.vi v8, v11, -4
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -677,7 +688,7 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    li a0, 144
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vadd.vi v8, v11, -4
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -724,7 +735,7 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
 ; CHECK-NEXT:    li a0, 195
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -742,7 +753,7 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
 ; CHECK-NEXT:    vadd.vi v12, v11, 2
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    li a0, 234
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vadd.vi v8, v11, -1
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -762,7 +773,7 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
 ; CHECK-NEXT:    vle8.v v12, (a0)
 ; CHECK-NEXT:    li a0, 234
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    vrgather.vv v10, v9, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll
index 05c99f04b7d14..c8c2aea4b4ebb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll
@@ -229,9 +229,8 @@ define void @splat_v4i64(ptr %x, i64 %y) {
 ;
 ; LMULMAX1-RV32-LABEL: splat_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    li a3, 5
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a3
+; LMULMAX1-RV32-NEXT:    vmv.v.i v0, 5
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v8, a2
 ; LMULMAX1-RV32-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
@@ -822,10 +821,12 @@ define void @vadd_vx_v16i64(ptr %a, i64 %b, ptr %c) {
 ; LMULMAX2-RV32-NEXT:    vle64.v v12, (a0)
 ; LMULMAX2-RV32-NEXT:    addi a0, a0, 32
 ; LMULMAX2-RV32-NEXT:    vle64.v v14, (a0)
-; LMULMAX2-RV32-NEXT:    li a0, 85
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a0
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v16, a2
+; LMULMAX2-RV32-NEXT:    li a0, 85
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a0
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmerge.vxm v16, v16, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vadd.vv v14, v14, v16
@@ -859,9 +860,8 @@ define void @vadd_vx_v16i64(ptr %a, i64 %b, ptr %c) {
 ; LMULMAX1-RV32-NEXT:    vle64.v v14, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a0, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle64.v v15, (a0)
-; LMULMAX1-RV32-NEXT:    li a0, 5
-; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT:    vmv.v.i v0, 5
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v16, a2
 ; LMULMAX1-RV32-NEXT:    vmerge.vxm v16, v16, a1, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index c660e7f8ff65e..be32acbaff9c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1105,45 +1105,45 @@ define void @mulhu_v16i8(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a0)
-; RV32-NEXT:    li a1, 513
+; RV32-NEXT:    lui a1, 3
+; RV32-NEXT:    addi a1, a1, -2044
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vmv.v.x v0, a1
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, 4
-; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    li a1, -128
+; RV32-NEXT:    vmerge.vxm v10, v9, a1, v0
 ; RV32-NEXT:    lui a1, 1
-; RV32-NEXT:    addi a2, a1, 78
+; RV32-NEXT:    addi a2, a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a2
+; RV32-NEXT:    vmv.v.x v0, a2
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT:    vmerge.vim v9, v9, 3, v0
-; RV32-NEXT:    lui a2, 8
-; RV32-NEXT:    addi a2, a2, 304
+; RV32-NEXT:    lui a2, %hi(.LCPI65_0)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI65_0)
+; RV32-NEXT:    vle8.v v11, (a2)
+; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV32-NEXT:    vsrl.vv v9, v8, v9
+; RV32-NEXT:    vmulhu.vv v9, v9, v11
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vmulhu.vv v8, v8, v10
+; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    li a2, 513
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a2
+; RV32-NEXT:    vmv.v.x v0, a2
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT:    vmerge.vim v9, v9, 2, v0
-; RV32-NEXT:    lui a2, 3
-; RV32-NEXT:    addi a2, a2, -2044
+; RV32-NEXT:    vmv.v.i v9, 4
+; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV32-NEXT:    addi a1, a1, 78
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a2
+; RV32-NEXT:    vmv.v.x v0, a1
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v10, 0
-; RV32-NEXT:    li a2, -128
-; RV32-NEXT:    vmerge.vxm v11, v10, a2, v0
-; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vmerge.vim v9, v9, 3, v0
+; RV32-NEXT:    lui a1, 8
+; RV32-NEXT:    addi a1, a1, 304
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vmv.v.x v0, a1
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT:    lui a1, %hi(.LCPI65_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI65_0)
-; RV32-NEXT:    vle8.v v12, (a1)
-; RV32-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV32-NEXT:    vsrl.vv v10, v8, v10
-; RV32-NEXT:    vmulhu.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vmulhu.vv v8, v8, v11
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vmerge.vim v9, v9, 2, v0
 ; RV32-NEXT:    vsrl.vv v8, v8, v9
 ; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    ret
@@ -1152,45 +1152,45 @@ define void @mulhu_v16i8(ptr %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a0)
-; RV64-NEXT:    li a1, 513
+; RV64-NEXT:    lui a1, 3
+; RV64-NEXT:    addiw a1, a1, -2044
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vmv.v.x v0, a1
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v9, 4
-; RV64-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV64-NEXT:    vmv.v.i v9, 0
+; RV64-NEXT:    li a1, -128
+; RV64-NEXT:    vmerge.vxm v10, v9, a1, v0
 ; RV64-NEXT:    lui a1, 1
-; RV64-NEXT:    addiw a2, a1, 78
+; RV64-NEXT:    addiw a2, a1, 32
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a2
+; RV64-NEXT:    vmv.v.x v0, a2
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT:    vmerge.vim v9, v9, 3, v0
-; RV64-NEXT:    lui a2, 8
-; RV64-NEXT:    addiw a2, a2, 304
+; RV64-NEXT:    lui a2, %hi(.LCPI65_0)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI65_0)
+; RV64-NEXT:    vle8.v v11, (a2)
+; RV64-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV64-NEXT:    vsrl.vv v9, v8, v9
+; RV64-NEXT:    vmulhu.vv v9, v9, v11
+; RV64-NEXT:    vsub.vv v8, v8, v9
+; RV64-NEXT:    vmulhu.vv v8, v8, v10
+; RV64-NEXT:    vadd.vv v8, v8, v9
+; RV64-NEXT:    li a2, 513
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a2
+; RV64-NEXT:    vmv.v.x v0, a2
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT:    vmerge.vim v9, v9, 2, v0
-; RV64-NEXT:    lui a2, 3
-; RV64-NEXT:    addiw a2, a2, -2044
+; RV64-NEXT:    vmv.v.i v9, 4
+; RV64-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV64-NEXT:    addiw a1, a1, 78
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a2
+; RV64-NEXT:    vmv.v.x v0, a1
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v10, 0
-; RV64-NEXT:    li a2, -128
-; RV64-NEXT:    vmerge.vxm v11, v10, a2, v0
-; RV64-NEXT:    addiw a1, a1, 32
+; RV64-NEXT:    vmerge.vim v9, v9, 3, v0
+; RV64-NEXT:    lui a1, 8
+; RV64-NEXT:    addiw a1, a1, 304
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vmv.v.x v0, a1
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT:    lui a1, %hi(.LCPI65_0)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI65_0)
-; RV64-NEXT:    vle8.v v12, (a1)
-; RV64-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV64-NEXT:    vsrl.vv v10, v8, v10
-; RV64-NEXT:    vmulhu.vv v10, v10, v12
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vmulhu.vv v8, v8, v11
-; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vmerge.vim v9, v9, 2, v0
 ; RV64-NEXT:    vsrl.vv v8, v8, v9
 ; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    ret
@@ -1205,31 +1205,30 @@ define void @mulhu_v8i16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    li a1, 33
+; CHECK-NEXT:    vmv.v.x v0, a1
+; CHECK-NEXT:    vmv.v.i v9, 3
+; CHECK-NEXT:    vmerge.vim v9, v9, 2, v0
+; CHECK-NEXT:    vmv.v.i v10, 1
+; CHECK-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v9, v10, 6
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    lui a1, 1048568
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv.s.x v10, a1
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 1
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmv.s.x v12, a1
 ; CHECK-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v9, v11, 6
+; CHECK-NEXT:    vslideup.vi v11, v10, 6
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    lui a1, %hi(.LCPI66_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI66_0)
-; CHECK-NEXT:    vle16.v v12, (a1)
-; CHECK-NEXT:    vsrl.vv v9, v8, v9
-; CHECK-NEXT:    vmulhu.vv v9, v9, v12
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vmulhu.vv v8, v8, v10
-; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    li a1, 33
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vmv.v.i v9, 3
-; CHECK-NEXT:    vmerge.vim v9, v9, 2, v0
-; CHECK-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v9, v11, 6
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vsrl.vv v11, v8, v11
+; CHECK-NEXT:    vmulhu.vv v10, v11, v10
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
+; CHECK-NEXT:    vadd.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -1349,18 +1348,18 @@ define void @mulhs_v16i8(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a0)
+; RV32-NEXT:    li a1, -123
+; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    lui a1, 5
 ; RV32-NEXT:    addi a1, a1, -1452
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vmv.v.x v0, a1
+; RV32-NEXT:    li a1, 57
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT:    vmerge.vxm v9, v9, a1, v0
+; RV32-NEXT:    vmulhu.vv v8, v8, v9
 ; RV32-NEXT:    vmv.v.i v9, 7
 ; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
-; RV32-NEXT:    li a1, -123
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 57
-; RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
-; RV32-NEXT:    vmulhu.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vv v8, v8, v9
 ; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    ret
@@ -1369,18 +1368,18 @@ define void @mulhs_v16i8(ptr %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a0)
+; RV64-NEXT:    li a1, -123
+; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    lui a1, 5
 ; RV64-NEXT:    addiw a1, a1, -1452
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vmv.v.x v0, a1
+; RV64-NEXT:    li a1, 57
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT:    vmerge.vxm v9, v9, a1, v0
+; RV64-NEXT:    vmulhu.vv v8, v8, v9
 ; RV64-NEXT:    vmv.v.i v9, 7
 ; RV64-NEXT:    vmerge.vim v9, v9, 1, v0
-; RV64-NEXT:    li a1, -123
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    li a1, 57
-; RV64-NEXT:    vmerge.vxm v10, v10, a1, v0
-; RV64-NEXT:    vmulhu.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vv v8, v8, v9
 ; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    ret
@@ -1395,11 +1394,11 @@ define void @mulhs_v8i16(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a0)
-; RV32-NEXT:    li a1, 105
-; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    lui a1, 5
 ; RV32-NEXT:    addi a1, a1, -1755
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 105
+; RV32-NEXT:    vmv.v.x v0, a1
 ; RV32-NEXT:    lui a1, 1048571
 ; RV32-NEXT:    addi a1, a1, 1755
 ; RV32-NEXT:    vmerge.vxm v9, v9, a1, v0
@@ -1414,11 +1413,11 @@ define void @mulhs_v8i16(ptr %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV64-NEXT:    vle16.v v8, (a0)
-; RV64-NEXT:    li a1, 105
-; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    lui a1, 5
 ; RV64-NEXT:    addiw a1, a1, -1755
 ; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    li a1, 105
+; RV64-NEXT:    vmv.v.x v0, a1
 ; RV64-NEXT:    lui a1, 1048571
 ; RV64-NEXT:    addiw a1, a1, 1755
 ; RV64-NEXT:    vmerge.vxm v9, v9, a1, v0
@@ -1439,25 +1438,24 @@ define void @mulhs_v6i16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v0, 6
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v9, -7
+; CHECK-NEXT:    vmerge.vim v9, v9, 7, v0
+; CHECK-NEXT:    vdiv.vv v9, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 7
-; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vmv.v.i v10, 7
+; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    li a1, -14
-; CHECK-NEXT:    vmadd.vx v10, a1, v9
+; CHECK-NEXT:    vmadd.vx v11, a1, v10
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 4
+; CHECK-NEXT:    vslidedown.vi v8, v8, 4
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vdiv.vv v9, v9, v10
-; CHECK-NEXT:    li a1, 6
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, -7
-; CHECK-NEXT:    vmerge.vim v10, v10, 7, v0
-; CHECK-NEXT:    vdiv.vv v8, v8, v10
+; CHECK-NEXT:    vdiv.vv v8, v8, v11
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vslideup.vi v9, v8, 4
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vse16.v v9, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x i16>, ptr %x
   %b = sdiv <6 x i16> %a, <i16 -7, i16 7, i16 7, i16 -7, i16 7, i16 -7>
@@ -1470,11 +1468,10 @@ define void @mulhs_v4i32(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    li a1, 5
-; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    lui a1, 419430
 ; RV32-NEXT:    addi a1, a1, 1639
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    lui a1, 629146
 ; RV32-NEXT:    addi a1, a1, -1639
 ; RV32-NEXT:    vmerge.vxm v9, v9, a1, v0
@@ -4955,45 +4952,45 @@ define void @mulhu_v32i8(ptr %x) {
 ; LMULMAX2-RV32-NEXT:    li a1, 32
 ; LMULMAX2-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle8.v v8, (a0)
+; LMULMAX2-RV32-NEXT:    vmv.v.i v10, 0
+; LMULMAX2-RV32-NEXT:    lui a2, 163907
+; LMULMAX2-RV32-NEXT:    addi a2, a2, -2044
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a2
+; LMULMAX2-RV32-NEXT:    li a2, -128
+; LMULMAX2-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v12, v10, a2, v0
 ; LMULMAX2-RV32-NEXT:    lui a2, 66049
 ; LMULMAX2-RV32-NEXT:    addi a2, a2, 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    lui a2, %hi(.LCPI181_0)
 ; LMULMAX2-RV32-NEXT:    addi a2, a2, %lo(.LCPI181_0)
-; LMULMAX2-RV32-NEXT:    vle8.v v10, (a2)
-; LMULMAX2-RV32-NEXT:    vmv.v.i v12, 0
-; LMULMAX2-RV32-NEXT:    vmerge.vim v14, v12, 1, v0
-; LMULMAX2-RV32-NEXT:    vsrl.vv v14, v8, v14
-; LMULMAX2-RV32-NEXT:    vmulhu.vv v10, v14, v10
+; LMULMAX2-RV32-NEXT:    vle8.v v14, (a2)
+; LMULMAX2-RV32-NEXT:    vmerge.vim v10, v10, 1, v0
+; LMULMAX2-RV32-NEXT:    vsrl.vv v10, v8, v10
+; LMULMAX2-RV32-NEXT:    vmulhu.vv v10, v10, v14
 ; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a2, 163907
-; LMULMAX2-RV32-NEXT:    addi a2, a2, -2044
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
-; LMULMAX2-RV32-NEXT:    li a2, -128
-; LMULMAX2-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmerge.vxm v12, v12, a2, v0
 ; LMULMAX2-RV32-NEXT:    vmulhu.vv v8, v8, v12
 ; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    vmv.v.i v10, 4
 ; LMULMAX2-RV32-NEXT:    lui a2, 8208
 ; LMULMAX2-RV32-NEXT:    addi a2, a2, 513
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.i v10, 4
 ; LMULMAX2-RV32-NEXT:    vmerge.vim v10, v10, 1, v0
 ; LMULMAX2-RV32-NEXT:    lui a2, 66785
 ; LMULMAX2-RV32-NEXT:    addi a2, a2, 78
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmerge.vim v10, v10, 3, v0
 ; LMULMAX2-RV32-NEXT:    lui a2, 529160
 ; LMULMAX2-RV32-NEXT:    addi a2, a2, 304
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmerge.vim v10, v10, 2, v0
 ; LMULMAX2-RV32-NEXT:    vsrl.vv v8, v8, v10
@@ -5005,45 +5002,45 @@ define void @mulhu_v32i8(ptr %x) {
 ; LMULMAX2-RV64-NEXT:    li a1, 32
 ; LMULMAX2-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    vle8.v v8, (a0)
+; LMULMAX2-RV64-NEXT:    vmv.v.i v10, 0
+; LMULMAX2-RV64-NEXT:    lui a2, 163907
+; LMULMAX2-RV64-NEXT:    addiw a2, a2, -2044
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; LMULMAX2-RV64-NEXT:    vmv.v.x v0, a2
+; LMULMAX2-RV64-NEXT:    li a2, -128
+; LMULMAX2-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v12, v10, a2, v0
 ; LMULMAX2-RV64-NEXT:    lui a2, 66049
 ; LMULMAX2-RV64-NEXT:    addiw a2, a2, 32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI181_0)
 ; LMULMAX2-RV64-NEXT:    addi a2, a2, %lo(.LCPI181_0)
-; LMULMAX2-RV64-NEXT:    vle8.v v10, (a2)
-; LMULMAX2-RV64-NEXT:    vmv.v.i v12, 0
-; LMULMAX2-RV64-NEXT:    vmerge.vim v14, v12, 1, v0
-; LMULMAX2-RV64-NEXT:    vsrl.vv v14, v8, v14
-; LMULMAX2-RV64-NEXT:    vmulhu.vv v10, v14, v10
+; LMULMAX2-RV64-NEXT:    vle8.v v14, (a2)
+; LMULMAX2-RV64-NEXT:    vmerge.vim v10, v10, 1, v0
+; LMULMAX2-RV64-NEXT:    vsrl.vv v10, v8, v10
+; LMULMAX2-RV64-NEXT:    vmulhu.vv v10, v10, v14
 ; LMULMAX2-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    lui a2, 163907
-; LMULMAX2-RV64-NEXT:    addiw a2, a2, -2044
-; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
-; LMULMAX2-RV64-NEXT:    li a2, -128
-; LMULMAX2-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; LMULMAX2-RV64-NEXT:    vmerge.vxm v12, v12, a2, v0
 ; LMULMAX2-RV64-NEXT:    vmulhu.vv v8, v8, v12
 ; LMULMAX2-RV64-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV64-NEXT:    vmv.v.i v10, 4
 ; LMULMAX2-RV64-NEXT:    lui a2, 8208
 ; LMULMAX2-RV64-NEXT:    addiw a2, a2, 513
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; LMULMAX2-RV64-NEXT:    vmv.v.i v10, 4
 ; LMULMAX2-RV64-NEXT:    vmerge.vim v10, v10, 1, v0
 ; LMULMAX2-RV64-NEXT:    lui a2, 66785
 ; LMULMAX2-RV64-NEXT:    addiw a2, a2, 78
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    vmerge.vim v10, v10, 3, v0
 ; LMULMAX2-RV64-NEXT:    lui a2, 529160
 ; LMULMAX2-RV64-NEXT:    addiw a2, a2, 304
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    vmerge.vim v10, v10, 2, v0
 ; LMULMAX2-RV64-NEXT:    vsrl.vv v8, v8, v10
@@ -5075,32 +5072,38 @@ define void @mulhu_v16i16(ptr %x) {
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle16.v v10, (a0)
-; LMULMAX2-RV32-NEXT:    lui a1, 2
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 289
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
-; LMULMAX2-RV32-NEXT:    vmv.v.i v12, 3
+; LMULMAX2-RV32-NEXT:    li a1, 257
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.i v12, 0
+; LMULMAX2-RV32-NEXT:    lui a1, 1048568
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v14, v12, a1, v0
 ; LMULMAX2-RV32-NEXT:    lui a1, 4
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 64
-; LMULMAX2-RV32-NEXT:    vmv.s.x v8, a1
-; LMULMAX2-RV32-NEXT:    vmerge.vim v12, v12, 2, v0
-; LMULMAX2-RV32-NEXT:    vmv1r.v v0, v8
-; LMULMAX2-RV32-NEXT:    vmerge.vim v12, v12, 1, v0
-; LMULMAX2-RV32-NEXT:    li a1, 257
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
-; LMULMAX2-RV32-NEXT:    vmv.v.i v14, 0
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI182_0)
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI182_0)
 ; LMULMAX2-RV32-NEXT:    vle16.v v16, (a1)
-; LMULMAX2-RV32-NEXT:    lui a1, 1048568
-; LMULMAX2-RV32-NEXT:    vmerge.vxm v18, v14, a1, v0
 ; LMULMAX2-RV32-NEXT:    vmv1r.v v0, v8
-; LMULMAX2-RV32-NEXT:    vmerge.vim v8, v14, 1, v0
+; LMULMAX2-RV32-NEXT:    vmerge.vim v12, v12, 1, v0
+; LMULMAX2-RV32-NEXT:    vsrl.vv v12, v10, v12
+; LMULMAX2-RV32-NEXT:    vmulhu.vv v12, v12, v16
+; LMULMAX2-RV32-NEXT:    vsub.vv v10, v10, v12
+; LMULMAX2-RV32-NEXT:    vmulhu.vv v10, v10, v14
+; LMULMAX2-RV32-NEXT:    vadd.vv v10, v10, v12
+; LMULMAX2-RV32-NEXT:    lui a1, 2
+; LMULMAX2-RV32-NEXT:    addi a1, a1, 289
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.i v12, 3
+; LMULMAX2-RV32-NEXT:    vmerge.vim v12, v12, 2, v0
+; LMULMAX2-RV32-NEXT:    vmv1r.v v0, v8
+; LMULMAX2-RV32-NEXT:    vmerge.vim v8, v12, 1, v0
 ; LMULMAX2-RV32-NEXT:    vsrl.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT:    vmulhu.vv v8, v8, v16
-; LMULMAX2-RV32-NEXT:    vsub.vv v10, v10, v8
-; LMULMAX2-RV32-NEXT:    vmulhu.vv v10, v10, v18
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vv v8, v8, v12
 ; LMULMAX2-RV32-NEXT:    vse16.v v8, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -5108,32 +5111,38 @@ define void @mulhu_v16i16(ptr %x) {
 ; LMULMAX2-RV64:       # %bb.0:
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    vle16.v v10, (a0)
-; LMULMAX2-RV64-NEXT:    lui a1, 2
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, 289
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
-; LMULMAX2-RV64-NEXT:    vmv.v.i v12, 3
+; LMULMAX2-RV64-NEXT:    li a1, 257
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; LMULMAX2-RV64-NEXT:    vmv.v.x v0, a1
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; LMULMAX2-RV64-NEXT:    vmv.v.i v12, 0
+; LMULMAX2-RV64-NEXT:    lui a1, 1048568
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v14, v12, a1, v0
 ; LMULMAX2-RV64-NEXT:    lui a1, 4
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 64
-; LMULMAX2-RV64-NEXT:    vmv.s.x v8, a1
-; LMULMAX2-RV64-NEXT:    vmerge.vim v12, v12, 2, v0
-; LMULMAX2-RV64-NEXT:    vmv1r.v v0, v8
-; LMULMAX2-RV64-NEXT:    vmerge.vim v12, v12, 1, v0
-; LMULMAX2-RV64-NEXT:    li a1, 257
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
-; LMULMAX2-RV64-NEXT:    vmv.v.i v14, 0
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; LMULMAX2-RV64-NEXT:    vmv.v.x v8, a1
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI182_0)
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, %lo(.LCPI182_0)
 ; LMULMAX2-RV64-NEXT:    vle16.v v16, (a1)
-; LMULMAX2-RV64-NEXT:    lui a1, 1048568
-; LMULMAX2-RV64-NEXT:    vmerge.vxm v18, v14, a1, v0
 ; LMULMAX2-RV64-NEXT:    vmv1r.v v0, v8
-; LMULMAX2-RV64-NEXT:    vmerge.vim v8, v14, 1, v0
+; LMULMAX2-RV64-NEXT:    vmerge.vim v12, v12, 1, v0
+; LMULMAX2-RV64-NEXT:    vsrl.vv v12, v10, v12
+; LMULMAX2-RV64-NEXT:    vmulhu.vv v12, v12, v16
+; LMULMAX2-RV64-NEXT:    vsub.vv v10, v10, v12
+; LMULMAX2-RV64-NEXT:    vmulhu.vv v10, v10, v14
+; LMULMAX2-RV64-NEXT:    vadd.vv v10, v10, v12
+; LMULMAX2-RV64-NEXT:    lui a1, 2
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, 289
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; LMULMAX2-RV64-NEXT:    vmv.v.x v0, a1
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; LMULMAX2-RV64-NEXT:    vmv.v.i v12, 3
+; LMULMAX2-RV64-NEXT:    vmerge.vim v12, v12, 2, v0
+; LMULMAX2-RV64-NEXT:    vmv1r.v v0, v8
+; LMULMAX2-RV64-NEXT:    vmerge.vim v8, v12, 1, v0
 ; LMULMAX2-RV64-NEXT:    vsrl.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT:    vmulhu.vv v8, v8, v16
-; LMULMAX2-RV64-NEXT:    vsub.vv v10, v10, v8
-; LMULMAX2-RV64-NEXT:    vmulhu.vv v10, v10, v18
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT:    vsrl.vv v8, v8, v12
 ; LMULMAX2-RV64-NEXT:    vse16.v v8, (a0)
 ; LMULMAX2-RV64-NEXT:    ret
 ;
@@ -5163,7 +5172,9 @@ define void @mulhu_v8i32(ptr %x) {
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vle32.v v8, (a0)
 ; LMULMAX2-NEXT:    li a1, 68
-; LMULMAX2-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-NEXT:    vmv.v.x v0, a1
+; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    lui a1, %hi(.LCPI183_0)
 ; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI183_0)
 ; LMULMAX2-NEXT:    vle32.v v10, (a1)
@@ -5175,7 +5186,9 @@ define void @mulhu_v8i32(ptr %x) {
 ; LMULMAX2-NEXT:    vmulhu.vv v8, v8, v12
 ; LMULMAX2-NEXT:    vadd.vv v8, v8, v10
 ; LMULMAX2-NEXT:    li a1, 136
-; LMULMAX2-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-NEXT:    vmv.v.x v0, a1
+; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vmv.v.i v10, 2
 ; LMULMAX2-NEXT:    vmerge.vim v10, v10, 1, v0
 ; LMULMAX2-NEXT:    vsrl.vv v8, v8, v10
@@ -5368,18 +5381,18 @@ define void @mulhs_v32i8(ptr %x) {
 ; LMULMAX2-RV32-NEXT:    li a1, 32
 ; LMULMAX2-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle8.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    li a2, -123
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a2
+; LMULMAX2-RV32-NEXT:    vmv.v.i v10, 7
 ; LMULMAX2-RV32-NEXT:    lui a2, 304453
 ; LMULMAX2-RV32-NEXT:    addi a2, a2, -1452
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
-; LMULMAX2-RV32-NEXT:    li a2, 57
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmerge.vxm v10, v10, a2, v0
-; LMULMAX2-RV32-NEXT:    vmulhu.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vmv.v.i v10, 7
 ; LMULMAX2-RV32-NEXT:    vmerge.vim v10, v10, 1, v0
+; LMULMAX2-RV32-NEXT:    li a1, -123
+; LMULMAX2-RV32-NEXT:    vmv.v.x v12, a1
+; LMULMAX2-RV32-NEXT:    li a1, 57
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v12, v12, a1, v0
+; LMULMAX2-RV32-NEXT:    vmulhu.vv v8, v8, v12
 ; LMULMAX2-RV32-NEXT:    vsrl.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    vse8.v v8, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
@@ -5389,18 +5402,18 @@ define void @mulhs_v32i8(ptr %x) {
 ; LMULMAX2-RV64-NEXT:    li a1, 32
 ; LMULMAX2-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    vle8.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    li a2, -123
-; LMULMAX2-RV64-NEXT:    vmv.v.x v10, a2
+; LMULMAX2-RV64-NEXT:    vmv.v.i v10, 7
 ; LMULMAX2-RV64-NEXT:    lui a2, 304453
 ; LMULMAX2-RV64-NEXT:    addiw a2, a2, -1452
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
-; LMULMAX2-RV64-NEXT:    li a2, 57
+; LMULMAX2-RV64-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; LMULMAX2-RV64-NEXT:    vmerge.vxm v10, v10, a2, v0
-; LMULMAX2-RV64-NEXT:    vmulhu.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vmv.v.i v10, 7
 ; LMULMAX2-RV64-NEXT:    vmerge.vim v10, v10, 1, v0
+; LMULMAX2-RV64-NEXT:    li a1, -123
+; LMULMAX2-RV64-NEXT:    vmv.v.x v12, a1
+; LMULMAX2-RV64-NEXT:    li a1, 57
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v12, v12, a1, v0
+; LMULMAX2-RV64-NEXT:    vmulhu.vv v8, v8, v12
 ; LMULMAX2-RV64-NEXT:    vsrl.vv v8, v8, v10
 ; LMULMAX2-RV64-NEXT:    vse8.v v8, (a0)
 ; LMULMAX2-RV64-NEXT:    ret
@@ -5414,7 +5427,7 @@ define void @mulhs_v32i8(ptr %x) {
 ; LMULMAX1-RV32-NEXT:    lui a2, 5
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -1452
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.x v0, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; LMULMAX1-RV32-NEXT:    vmv.v.i v10, -9
 ; LMULMAX1-RV32-NEXT:    vmerge.vim v10, v10, 9, v0
@@ -5433,7 +5446,7 @@ define void @mulhs_v32i8(ptr %x) {
 ; LMULMAX1-RV64-NEXT:    lui a2, 5
 ; LMULMAX1-RV64-NEXT:    addiw a2, a2, -1452
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; LMULMAX1-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-RV64-NEXT:    vmv.v.x v0, a2
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; LMULMAX1-RV64-NEXT:    vmv.v.i v10, -9
 ; LMULMAX1-RV64-NEXT:    vmerge.vim v10, v10, 9, v0
@@ -5453,14 +5466,16 @@ define void @mulhs_v16i16(ptr %x) {
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle16.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    lui a1, 7
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -1687
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
 ; LMULMAX2-RV32-NEXT:    lui a1, 5
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -1755
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32-NEXT:    lui a1, 7
+; LMULMAX2-RV32-NEXT:    addi a1, a1, -1687
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a1
 ; LMULMAX2-RV32-NEXT:    lui a1, 1048571
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1755
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; LMULMAX2-RV32-NEXT:    vmulh.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    vsra.vi v8, v8, 1
@@ -5473,14 +5488,16 @@ define void @mulhs_v16i16(ptr %x) {
 ; LMULMAX2-RV64:       # %bb.0:
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    vle16.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    lui a1, 7
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1687
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
 ; LMULMAX2-RV64-NEXT:    lui a1, 5
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1755
 ; LMULMAX2-RV64-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV64-NEXT:    lui a1, 7
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1687
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; LMULMAX2-RV64-NEXT:    vmv.v.x v0, a1
 ; LMULMAX2-RV64-NEXT:    lui a1, 1048571
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1755
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; LMULMAX2-RV64-NEXT:    vmulh.vv v8, v8, v10
 ; LMULMAX2-RV64-NEXT:    vsra.vi v8, v8, 1
@@ -5496,7 +5513,7 @@ define void @mulhs_v16i16(ptr %x) {
 ; LMULMAX1-NEXT:    addi a1, a0, 16
 ; LMULMAX1-NEXT:    vle16.v v9, (a1)
 ; LMULMAX1-NEXT:    li a2, 105
-; LMULMAX1-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-NEXT:    vmv.v.x v0, a2
 ; LMULMAX1-NEXT:    vmv.v.i v10, 7
 ; LMULMAX1-NEXT:    vmerge.vim v10, v10, -7, v0
 ; LMULMAX1-NEXT:    vdiv.vv v9, v9, v10
@@ -5515,13 +5532,15 @@ define void @mulhs_v8i32(ptr %x) {
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle32.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    li a1, 85
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
 ; LMULMAX2-RV32-NEXT:    lui a1, 419430
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1639
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32-NEXT:    li a1, 85
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a1
 ; LMULMAX2-RV32-NEXT:    lui a1, 629146
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -1639
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; LMULMAX2-RV32-NEXT:    vmulh.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 31
@@ -5552,11 +5571,10 @@ define void @mulhs_v8i32(ptr %x) {
 ; LMULMAX1-RV32-NEXT:    vle32.v v8, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle32.v v9, (a1)
-; LMULMAX1-RV32-NEXT:    li a2, 5
-; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a2
 ; LMULMAX1-RV32-NEXT:    lui a2, 419430
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 1639
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.i v0, 5
 ; LMULMAX1-RV32-NEXT:    lui a2, 629146
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -1639
 ; LMULMAX1-RV32-NEXT:    vmerge.vxm v10, v10, a2, v0
@@ -5600,18 +5618,21 @@ define void @mulhs_v4i64(ptr %x) {
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    li a1, 17
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a2, a1, 1365
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a2
+; LMULMAX2-RV32-NEXT:    li a2, 17
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a2
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1366
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmulh.vv v10, v8, v10
 ; LMULMAX2-RV32-NEXT:    li a1, 51
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmv.v.i v12, -1
 ; LMULMAX2-RV32-NEXT:    vmerge.vim v12, v12, 0, v0
@@ -5620,7 +5641,8 @@ define void @mulhs_v4i64(ptr %x) {
 ; LMULMAX2-RV32-NEXT:    li a1, 63
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v8, v12, a1
 ; LMULMAX2-RV32-NEXT:    li a1, 68
-; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-RV32-NEXT:    vmv.v.x v0, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vmv.v.i v10, 0
 ; LMULMAX2-RV32-NEXT:    vmerge.vim v10, v10, 1, v0
@@ -5634,26 +5656,27 @@ define void @mulhs_v4i64(ptr %x) {
 ; LMULMAX2-RV64:       # %bb.0:
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    li a1, 5
-; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; LMULMAX2-RV64-NEXT:    vmv.v.i v0, 5
+; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64-NEXT:    vmv.v.i v10, 1
+; LMULMAX2-RV64-NEXT:    vmerge.vim v10, v10, 0, v0
 ; LMULMAX2-RV64-NEXT:    lui a1, 349525
 ; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
+; LMULMAX2-RV64-NEXT:    slli a2, a1, 32
+; LMULMAX2-RV64-NEXT:    add a1, a1, a2
 ; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI188_0)
 ; LMULMAX2-RV64-NEXT:    ld a2, %lo(.LCPI188_0)(a2)
-; LMULMAX2-RV64-NEXT:    slli a3, a1, 32
-; LMULMAX2-RV64-NEXT:    add a1, a1, a3
-; LMULMAX2-RV64-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV64-NEXT:    vmerge.vxm v10, v10, a2, v0
-; LMULMAX2-RV64-NEXT:    vmulh.vv v10, v8, v10
 ; LMULMAX2-RV64-NEXT:    vmv.v.i v12, -1
 ; LMULMAX2-RV64-NEXT:    vmerge.vim v12, v12, 0, v0
-; LMULMAX2-RV64-NEXT:    vmadd.vv v12, v8, v10
+; LMULMAX2-RV64-NEXT:    vmv.v.x v14, a1
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v14, v14, a2, v0
+; LMULMAX2-RV64-NEXT:    vmulh.vv v14, v8, v14
+; LMULMAX2-RV64-NEXT:    vmacc.vv v14, v8, v12
+; LMULMAX2-RV64-NEXT:    vsra.vv v8, v14, v10
 ; LMULMAX2-RV64-NEXT:    li a1, 63
-; LMULMAX2-RV64-NEXT:    vsrl.vx v8, v12, a1
-; LMULMAX2-RV64-NEXT:    vmv.v.i v10, 1
-; LMULMAX2-RV64-NEXT:    vmerge.vim v10, v10, 0, v0
-; LMULMAX2-RV64-NEXT:    vsra.vv v10, v12, v10
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV64-NEXT:    vsrl.vx v10, v14, a1
+; LMULMAX2-RV64-NEXT:    vadd.vv v8, v8, v10
 ; LMULMAX2-RV64-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV64-NEXT:    ret
 ;

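For readers following the test churn above: the construct at issue is a
vXi1 build_vector. A minimal hand-written sketch, not taken from the
patch or its tests (the function name and element values are invented):

    ; Hypothetical IR. A full chain of insertelements like this is
    ; typically combined into a single BUILD_VECTOR node in
    ; SelectionDAG, which is where the vXi1 lowering applies.
    define <4 x i1> @buildvec_mask_sketch(i1 %x, i1 %y) {
      %v0 = insertelement <4 x i1> poison, i1 %x, i32 0
      %v1 = insertelement <4 x i1> %v0, i1 %y, i32 1
      %v2 = insertelement <4 x i1> %v1, i1 %x, i32 2
      %v3 = insertelement <4 x i1> %v2, i1 %y, i32 3
      ret <4 x i1> %v3
    }

The visible effect in the checks above is that mask materializations
into v0 now go through vmv.v.x under a VL=1 vsetivli where the old
output used vmv.s.x.
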
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 9f2c3e6023dce..a4c2fa02d55bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -16,10 +16,10 @@ define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) {
 ; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    vadd.vv v9, v8, v8
 ; CHECK-NEXT:    vrgather.vv v8, v10, v9
-; CHECK-NEXT:    li a0, 4
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v12, v10, 4
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v8, v12, 0, v0.t
 ; CHECK-NEXT:    vadd.vi v11, v9, 1
@@ -132,16 +132,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 82
+; RV32-NEXT:    li a3, 78
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 82 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xce, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 78 * vlenb
 ; RV32-NEXT:    addi a3, a1, 256
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v16, (a3)
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 57
+; RV32-NEXT:    li a4, 53
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
@@ -156,150 +156,122 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vs4r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vadd.vi v8, v24, -4
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a4, a3, 4
-; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    li a4, 13
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs4r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vrgather.vv v12, v16, v8
+; RV32-NEXT:    vrgather.vv v4, v16, v8
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 45
+; RV32-NEXT:    li a4, 41
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs4r.v v12, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v4, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vadd.vi v8, v24, -10
 ; RV32-NEXT:    lui a3, 12
-; RV32-NEXT:    vmv.s.x v0, a3
+; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a3
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a4, a3, 5
-; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    li a4, 29
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v16, v16, 16
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a4, a3, 6
-; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    li a4, 69
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgather.vv v12, v16, v8, v0.t
+; RV32-NEXT:    vrgather.vv v4, v16, v8, v0.t
+; RV32-NEXT:    lui a3, %hi(.LCPI6_0)
+; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_0)
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vle32.v v24, (a3)
+; RV32-NEXT:    vle32.v v16, (a1)
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 41
+; RV32-NEXT:    li a4, 61
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs4r.v v12, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, a1, 128
-; RV32-NEXT:    lui a4, %hi(.LCPI6_0)
-; RV32-NEXT:    addi a4, a4, %lo(.LCPI6_0)
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    lui a5, %hi(.LCPI6_1)
-; RV32-NEXT:    addi a5, a5, %lo(.LCPI6_1)
-; RV32-NEXT:    lui a6, 1
-; RV32-NEXT:    vle32.v v8, (a4)
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a7, a4, 3
-; RV32-NEXT:    add a4, a7, a4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, a1, 128
+; RV32-NEXT:    vrgather.vv v8, v16, v24
+; RV32-NEXT:    lui a3, %hi(.LCPI6_1)
+; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_1)
+; RV32-NEXT:    lui a4, 1
+; RV32-NEXT:    addi a4, a4, -64
+; RV32-NEXT:    vle32.v v16, (a3)
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a5, 21
+; RV32-NEXT:    mul a3, a3, a5
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vle32.v v16, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 73
-; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v8, (a5)
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 25
-; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v8, (a3)
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, a6, -64
-; RV32-NEXT:    vmv.s.x v24, a1
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 3
+; RV32-NEXT:    slli a3, a1, 4
 ; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v24, v16, v0
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v2, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv1r.v v0, v2
+; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    li a3, 21
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v4, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 41
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv.v.v v8, v24
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 41
+; RV32-NEXT:    li a3, 21
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 37
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vi v16, v12, -2
+; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vi v8, v24, -2
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v8, v24, v16
-; RV32-NEXT:    vadd.vi v16, v12, -8
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v4, v16, v8
+; RV32-NEXT:    vadd.vi v8, v24, -8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 29
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv1r.v v1, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 69
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
-; RV32-NEXT:    vmv.v.v v4, v8
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v4, v16, v8, v0.t
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_2)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_2)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
@@ -308,37 +280,42 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vle32.v v16, (a1)
 ; RV32-NEXT:    vle32.v v8, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 61
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v24, v8, v16
-; RV32-NEXT:    vmv1r.v v0, v2
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v8, v24, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 4
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v4, v24
+; RV32-NEXT:    vmv.v.v v4, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 4
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
@@ -347,12 +324,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v12, v16, v8
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v12, v0, v8
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 37
 ; RV32-NEXT:    mul a1, a1, a3
@@ -361,104 +338,87 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vi v8, v8, -6
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 3
-; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 29
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v12, v16, v8, v0.t
+; RV32-NEXT:    vl1r.v v1, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    li a3, 69
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v12, v16, v8, v0.t
+; RV32-NEXT:    vmv.v.v v4, v12
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_5)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_5)
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    lui a3, %hi(.LCPI6_6)
-; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_6)
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vle32.v v8, (a3)
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    li a1, 960
-; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 61
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgather.vv v8, v24, v16
+; RV32-NEXT:    lui a1, %hi(.LCPI6_6)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_6)
+; RV32-NEXT:    li a3, 960
+; RV32-NEXT:    vle32.v v24, (a1)
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a3
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v4, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 37
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv.v.v v12, v8
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 37
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_7)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_7)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgather.vv v12, v24, v8
+; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 4
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 13
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 69
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgather.vv v12, v24, v8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
@@ -467,132 +427,130 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_9)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_9)
-; RV32-NEXT:    vle32.v v24, (a1)
+; RV32-NEXT:    vle32.v v0, (a1)
 ; RV32-NEXT:    vle32.v v8, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 4
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 29
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 61
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v8, v0, v24
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v8, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 4
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 29
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmv.v.v v12, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    lui a1, %hi(.LCPI6_10)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_10)
-; RV32-NEXT:    vle32.v v8, (a1)
-; RV32-NEXT:    lui a1, 15
-; RV32-NEXT:    vmv.s.x v1, a1
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 3
-; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v4, v16, v12
-; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vrgather.vv v4, v8, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI6_10)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_10)
+; RV32-NEXT:    lui a3, 15
+; RV32-NEXT:    vle32.v v8, (a1)
+; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT:    vmv.v.x v2, a3
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v0, v2
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 69
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgather.vv v4, v16, v8, v0.t
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_11)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_11)
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    lui a3, %hi(.LCPI6_12)
-; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_12)
-; RV32-NEXT:    vle32.v v24, (a1)
-; RV32-NEXT:    vle32.v v8, (a3)
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vle32.v v16, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 61
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    li a1, 1008
-; RV32-NEXT:    vmv.s.x v2, a1
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v8, v24, v16
+; RV32-NEXT:    lui a1, %hi(.LCPI6_12)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_12)
+; RV32-NEXT:    li a3, 1008
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v8, v16, v24
-; RV32-NEXT:    vmv1r.v v0, v2
+; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v4, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 29
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v4, v8
+; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_13)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_13)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vle32.v v8, (a1)
-; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vmv1r.v v0, v2
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 45
+; RV32-NEXT:    li a3, 69
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 41
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v12, v16, v8, v0.t
+; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v12, v24, v8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 45
+; RV32-NEXT:    li a3, 41
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -602,38 +560,43 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    lui a2, %hi(.LCPI6_15)
 ; RV32-NEXT:    addi a2, a2, %lo(.LCPI6_15)
-; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vle32.v v24, (a1)
 ; RV32-NEXT:    vle32.v v8, (a2)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 6
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    li a2, 69
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 73
+; RV32-NEXT:    li a2, 61
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v8, v24, v16
-; RV32-NEXT:    vmv1r.v v0, v2
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v8, v0, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 49
+; RV32-NEXT:    li a2, 53
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 6
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    li a2, 45
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 69
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 45
+; RV32-NEXT:    li a2, 41
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -643,16 +606,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vse32.v v12, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
-; RV32-NEXT:    vse32.v v4, (a1)
-; RV32-NEXT:    addi a1, a0, 192
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a3, a2, 5
-; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    li a3, 29
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    addi a1, a0, 128
+; RV32-NEXT:    addi a1, a0, 192
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 37
 ; RV32-NEXT:    mul a2, a2, a3
@@ -660,23 +621,31 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
+; RV32-NEXT:    addi a1, a0, 128
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a3, a2, 2
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    addi a1, a0, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 25
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a3, a2, 4
+; RV32-NEXT:    add a2, a3, a2
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 41
+; RV32-NEXT:    li a2, 21
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 82
+; RV32-NEXT:    li a1, 78
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
@@ -691,72 +660,81 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    sub sp, sp, a2
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd6, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 86 * vlenb
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
-; RV64-NEXT:    addi a2, a1, 128
-; RV64-NEXT:    vle64.v v16, (a2)
+; RV64-NEXT:    addi a2, a1, 256
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vle64.v v8, (a2)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 61
+; RV64-NEXT:    li a3, 77
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vle64.v v0, (a1)
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a2, a1, 128
+; RV64-NEXT:    vle64.v v24, (a2)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 69
+; RV64-NEXT:    li a3, 53
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vle64.v v0, (a1)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 61
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    li a2, 6
-; RV64-NEXT:    vmul.vx v8, v8, a2
-; RV64-NEXT:    vrgather.vv v24, v0, v8
+; RV64-NEXT:    li a1, 6
+; RV64-NEXT:    vmul.vx v8, v8, a1
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    vrgather.vv v16, v0, v8
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 77
+; RV64-NEXT:    li a3, 69
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vmv.s.x v0, a2
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 41
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vadd.vi v8, v8, -16
-; RV64-NEXT:    vrgather.vv v24, v16, v8, v0.t
-; RV64-NEXT:    addi a1, a1, 256
-; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 41
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    li a1, 128
-; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 77
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v8, v24, 4
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 37
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vrgather.vi v8, v16, 4
 ; RV64-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v16, v16, 8
+; RV64-NEXT:    vslidedown.vi v24, v24, 8
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 45
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v8, v16, 2, v0.t
+; RV64-NEXT:    vrgather.vi v8, v24, 2, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v8, v24
+; RV64-NEXT:    vmv.v.v v8, v16
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 29
 ; RV64-NEXT:    mul a1, a1, a2
@@ -765,20 +743,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 77
+; RV64-NEXT:    li a2, 69
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vadd.vi v8, v16, 1
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vi v8, v24, 1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 69
+; RV64-NEXT:    li a2, 61
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v24, v0, v8
-; RV64-NEXT:    vadd.vi v8, v16, -15
+; RV64-NEXT:    vrgather.vv v16, v0, v8
+; RV64-NEXT:    vadd.vi v8, v24, -15
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 41
 ; RV64-NEXT:    mul a1, a1, a2
@@ -786,20 +764,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 61
+; RV64-NEXT:    li a2, 53
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v24, v16, v8, v0.t
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 77
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v8, v16, 5
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v8, v24, 5
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 37
 ; RV64-NEXT:    mul a1, a1, a2
@@ -811,10 +789,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v8, v16, 3, v0.t
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v8, v24, 3, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v8, v24
+; RV64-NEXT:    vmv.v.v v8, v16
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 25
 ; RV64-NEXT:    mul a1, a1, a2
@@ -832,54 +810,68 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vmv.s.x v12, zero
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 13
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vslideup.vi v8, v12, 5
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 77
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v12, v24, v8
-; RV64-NEXT:    vrgather.vi v12, v16, 4, v0.t
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vv v12, v16, v8
+; RV64-NEXT:    vrgather.vi v12, v24, 4, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 41
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 77
+; RV64-NEXT:    li a2, 69
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vadd.vi v0, v8, 2
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vi v8, v0, 2
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 69
+; RV64-NEXT:    li a2, 61
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v24, v16, v0
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vv v16, v24, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a1, 24
-; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vadd.vi v8, v0, -14
+; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a1
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vadd.vi v8, v8, -14
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 61
+; RV64-NEXT:    li a2, 53
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v24, v16, v8, v0.t
+; RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 41
@@ -887,7 +879,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v8, v24
+; RV64-NEXT:    vmv.v.v v8, v16
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 41
 ; RV64-NEXT:    mul a1, a1, a2
@@ -898,28 +890,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    li a1, 1
 ; RV64-NEXT:    vmv.v.i v8, 7
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 21
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 4
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs4r.v v8, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv.s.x v12, a1
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vslideup.vi v8, v12, 5
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 77
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v16, v24, v8
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vv v20, v16, v8
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 37
 ; RV64-NEXT:    mul a1, a1, a2
@@ -932,64 +924,64 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v16, v8, 5, v0.t
+; RV64-NEXT:    vrgather.vi v20, v8, 5, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 37
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 77
+; RV64-NEXT:    li a2, 69
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vadd.vi v16, v0, 3
+; RV64-NEXT:    vadd.vi v24, v0, 3
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 69
+; RV64-NEXT:    li a2, 61
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v8, v24, v16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vv v8, v16, v24
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vadd.vi v8, v0, -13
+; RV64-NEXT:    vadd.vi v24, v0, -13
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 61
+; RV64-NEXT:    li a2, 53
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 37
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v8, v16
+; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v12, v8
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 37
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 7, e64, m4, tu, ma
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 13
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
@@ -1000,20 +992,24 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vslideup.vi v16, v8, 6
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    li a1, 192
-; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 77
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v20, v8, 2
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v20, v8, 2
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 45
 ; RV64-NEXT:    mul a1, a1, a2
@@ -1027,82 +1023,76 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 77
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vadd.vi v16, v0, 4
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 69
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v8, v24, v16
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vi v24, v0, 4
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    li a2, 61
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vv v8, v16, v24
 ; RV64-NEXT:    li a1, 28
-; RV64-NEXT:    vmv.s.x v16, a1
-; RV64-NEXT:    vadd.vi v8, v0, -12
-; RV64-NEXT:    vmv1r.v v0, v16
-; RV64-NEXT:    vmv1r.v v1, v16
+; RV64-NEXT:    vadd.vi v24, v0, -12
+; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 61
-; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    li a2, 53
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
+; RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a2, a1, 5
 ; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v8, v16
+; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v12, v8
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a2, a1, 5
 ; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 7, e64, m4, tu, ma
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vslideup.vi v16, v8, 6
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 77
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v4, v8, 3
+; RV64-NEXT:    vrgather.vi v20, v8, 3
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
@@ -1112,42 +1102,81 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v4, v8, v16, v0.t
+; RV64-NEXT:    vrgather.vv v20, v8, v16, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 77
+; RV64-NEXT:    li a2, 69
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vadd.vi v16, v8, 5
+; RV64-NEXT:    vadd.vi v24, v8, 5
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 61
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vv v8, v0, v24
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 69
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v8, v24, v16
+; RV64-NEXT:    vadd.vi v8, v24, -11
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 77
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vadd.vi v16, v16, -11
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 61
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 53
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 77
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v4, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v12, v16
 ; RV64-NEXT:    addi a1, a0, 320
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v4, (a1)
+; RV64-NEXT:    vse64.v v12, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a3, a2, 5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
index f4cd2ea48d9d4..f78210ef4b3e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
@@ -100,16 +100,14 @@ define <2 x i1> @buildvec_mask_optsize_nonconst_v2i1(i1 %x, i1 %y) optsize {
 define <3 x i1> @buildvec_mask_v1i1() {
 ; CHECK-LABEL: buildvec_mask_v1i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    ret
 ;
 ; ZVE32F-LABEL: buildvec_mask_v1i1:
 ; ZVE32F:       # %bb.0:
-; ZVE32F-NEXT:    li a0, 2
 ; ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a0
+; ZVE32F-NEXT:    vmv.v.i v0, 2
 ; ZVE32F-NEXT:    ret
   ret <3 x i1> <i1 0, i1 1, i1 0>
 }
@@ -117,16 +115,14 @@ define <3 x i1> @buildvec_mask_v1i1() {
 define <3 x i1> @buildvec_mask_optsize_v1i1() optsize {
 ; CHECK-LABEL: buildvec_mask_optsize_v1i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    ret
 ;
 ; ZVE32F-LABEL: buildvec_mask_optsize_v1i1:
 ; ZVE32F:       # %bb.0:
-; ZVE32F-NEXT:    li a0, 2
 ; ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a0
+; ZVE32F-NEXT:    vmv.v.i v0, 2
 ; ZVE32F-NEXT:    ret
   ret <3 x i1> <i1 0, i1 1, i1 0>
 }
@@ -134,16 +130,14 @@ define <3 x i1> @buildvec_mask_optsize_v1i1() optsize {
 define <4 x i1> @buildvec_mask_v4i1() {
 ; CHECK-LABEL: buildvec_mask_v4i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 6
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 6
 ; CHECK-NEXT:    ret
 ;
 ; ZVE32F-LABEL: buildvec_mask_v4i1:
 ; ZVE32F:       # %bb.0:
-; ZVE32F-NEXT:    li a0, 6
 ; ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a0
+; ZVE32F-NEXT:    vmv.v.i v0, 6
 ; ZVE32F-NEXT:    ret
   ret <4 x i1> <i1 0, i1 1, i1 1, i1 0>
 }
@@ -151,9 +145,8 @@ define <4 x i1> @buildvec_mask_v4i1() {
 define <4 x i1> @buildvec_mask_nonconst_v4i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: buildvec_mask_nonconst_v4i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 3
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a2
+; CHECK-NEXT:    vmv.v.i v0, 3
 ; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
@@ -162,9 +155,8 @@ define <4 x i1> @buildvec_mask_nonconst_v4i1(i1 %x, i1 %y) {
 ;
 ; ZVE32F-LABEL: buildvec_mask_nonconst_v4i1:
 ; ZVE32F:       # %bb.0:
-; ZVE32F-NEXT:    li a2, 3
 ; ZVE32F-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a2
+; ZVE32F-NEXT:    vmv.v.i v0, 3
 ; ZVE32F-NEXT:    vmv.v.x v8, a1
 ; ZVE32F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
@@ -243,14 +235,14 @@ define <8 x i1> @buildvec_mask_v8i1() {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 182
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    ret
 ;
 ; ZVE32F-LABEL: buildvec_mask_v8i1:
 ; ZVE32F:       # %bb.0:
 ; ZVE32F-NEXT:    li a0, 182
 ; ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a0
+; ZVE32F-NEXT:    vmv.v.x v0, a0
 ; ZVE32F-NEXT:    ret
   ret <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>
 }
@@ -258,9 +250,9 @@ define <8 x i1> @buildvec_mask_v8i1() {
 define <8 x i1> @buildvec_mask_nonconst_v8i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: buildvec_mask_nonconst_v8i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 19
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a2
+; CHECK-NEXT:    li a2, 19
+; CHECK-NEXT:    vmv.v.x v0, a2
 ; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
@@ -269,9 +261,9 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1(i1 %x, i1 %y) {
 ;
 ; ZVE32F-LABEL: buildvec_mask_nonconst_v8i1:
 ; ZVE32F:       # %bb.0:
-; ZVE32F-NEXT:    li a2, 19
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a2
+; ZVE32F-NEXT:    li a2, 19
+; ZVE32F-NEXT:    vmv.v.x v0, a2
 ; ZVE32F-NEXT:    vmv.v.x v8, a1
 ; ZVE32F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
@@ -420,14 +412,14 @@ define <10 x i1> @buildvec_mask_v10i1() {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 949
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    ret
 ;
 ; ZVE32F-LABEL: buildvec_mask_v10i1:
 ; ZVE32F:       # %bb.0:
 ; ZVE32F-NEXT:    li a0, 949
 ; ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a0
+; ZVE32F-NEXT:    vmv.v.x v0, a0
 ; ZVE32F-NEXT:    ret
   ret <10 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>
 }
@@ -438,7 +430,7 @@ define <16 x i1> @buildvec_mask_v16i1() {
 ; CHECK-RV32-NEXT:    lui a0, 11
 ; CHECK-RV32-NEXT:    addi a0, a0, 1718
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v0, a0
+; CHECK-RV32-NEXT:    vmv.v.x v0, a0
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: buildvec_mask_v16i1:
@@ -446,7 +438,7 @@ define <16 x i1> @buildvec_mask_v16i1() {
 ; CHECK-RV64-NEXT:    lui a0, 11
 ; CHECK-RV64-NEXT:    addiw a0, a0, 1718
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v0, a0
+; CHECK-RV64-NEXT:    vmv.v.x v0, a0
 ; CHECK-RV64-NEXT:    ret
   ret <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>
 }
@@ -456,14 +448,14 @@ define <16 x i1> @buildvec_mask_v16i1_undefs() {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1722
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    ret
 ;
 ; ZVE32F-LABEL: buildvec_mask_v16i1_undefs:
 ; ZVE32F:       # %bb.0:
 ; ZVE32F-NEXT:    li a0, 1722
 ; ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a0
+; ZVE32F-NEXT:    vmv.v.x v0, a0
 ; ZVE32F-NEXT:    ret
   ret <16 x i1> <i1 undef, i1 1, i1 undef, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>
 }
@@ -473,20 +465,20 @@ define <32 x i1> @buildvec_mask_v32i1() {
 ; RV32-LMULMAX1:       # %bb.0:
 ; RV32-LMULMAX1-NEXT:    li a0, 1776
 ; RV32-LMULMAX1-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-LMULMAX1-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 11
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, 1718
-; RV32-LMULMAX1-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v8, a0
 ; RV32-LMULMAX1-NEXT:    ret
 ;
 ; RV64-LMULMAX1-LABEL: buildvec_mask_v32i1:
 ; RV64-LMULMAX1:       # %bb.0:
 ; RV64-LMULMAX1-NEXT:    li a0, 1776
 ; RV64-LMULMAX1-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-LMULMAX1-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 11
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, 1718
-; RV64-LMULMAX1-NEXT:    vmv.s.x v8, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v8, a0
 ; RV64-LMULMAX1-NEXT:    ret
 ;
 ; RV32-LMULMAX2-LABEL: buildvec_mask_v32i1:
@@ -494,7 +486,7 @@ define <32 x i1> @buildvec_mask_v32i1() {
 ; RV32-LMULMAX2-NEXT:    lui a0, 748384
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, 1776
 ; RV32-LMULMAX2-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-LMULMAX2-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX2-NEXT:    ret
 ;
 ; RV64-LMULMAX2-LABEL: buildvec_mask_v32i1:
@@ -502,7 +494,7 @@ define <32 x i1> @buildvec_mask_v32i1() {
 ; RV64-LMULMAX2-NEXT:    lui a0, 748384
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, 1776
 ; RV64-LMULMAX2-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-LMULMAX2-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX2-NEXT:    ret
 ;
 ; RV32-LMULMAX4-LABEL: buildvec_mask_v32i1:
@@ -510,7 +502,7 @@ define <32 x i1> @buildvec_mask_v32i1() {
 ; RV32-LMULMAX4-NEXT:    lui a0, 748384
 ; RV32-LMULMAX4-NEXT:    addi a0, a0, 1776
 ; RV32-LMULMAX4-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-LMULMAX4-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX4-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX4-NEXT:    ret
 ;
 ; RV64-LMULMAX4-LABEL: buildvec_mask_v32i1:
@@ -518,7 +510,7 @@ define <32 x i1> @buildvec_mask_v32i1() {
 ; RV64-LMULMAX4-NEXT:    lui a0, 748384
 ; RV64-LMULMAX4-NEXT:    addiw a0, a0, 1776
 ; RV64-LMULMAX4-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-LMULMAX4-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX4-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX4-NEXT:    ret
 ;
 ; RV32-LMULMAX8-LABEL: buildvec_mask_v32i1:
@@ -526,7 +518,7 @@ define <32 x i1> @buildvec_mask_v32i1() {
 ; RV32-LMULMAX8-NEXT:    lui a0, 748384
 ; RV32-LMULMAX8-NEXT:    addi a0, a0, 1776
 ; RV32-LMULMAX8-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-LMULMAX8-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX8-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX8-NEXT:    ret
 ;
 ; RV64-LMULMAX8-LABEL: buildvec_mask_v32i1:
@@ -534,7 +526,7 @@ define <32 x i1> @buildvec_mask_v32i1() {
 ; RV64-LMULMAX8-NEXT:    lui a0, 748384
 ; RV64-LMULMAX8-NEXT:    addiw a0, a0, 1776
 ; RV64-LMULMAX8-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-LMULMAX8-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX8-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX8-NEXT:    ret
   ret <32 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>
 }
@@ -544,13 +536,13 @@ define <64 x i1> @buildvec_mask_v64i1() {
 ; RV32-LMULMAX1:       # %bb.0:
 ; RV32-LMULMAX1-NEXT:    li a0, 1776
 ; RV32-LMULMAX1-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-LMULMAX1-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 4
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX1-NEXT:    vmv.s.x v9, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v9, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 11
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, 1718
-; RV32-LMULMAX1-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v8, a0
 ; RV32-LMULMAX1-NEXT:    vmv1r.v v10, v8
 ; RV32-LMULMAX1-NEXT:    ret
 ;
@@ -558,13 +550,13 @@ define <64 x i1> @buildvec_mask_v64i1() {
 ; RV64-LMULMAX1:       # %bb.0:
 ; RV64-LMULMAX1-NEXT:    li a0, 1776
 ; RV64-LMULMAX1-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-LMULMAX1-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 4
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, -1793
-; RV64-LMULMAX1-NEXT:    vmv.s.x v9, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v9, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 11
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, 1718
-; RV64-LMULMAX1-NEXT:    vmv.s.x v8, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v8, a0
 ; RV64-LMULMAX1-NEXT:    vmv1r.v v10, v8
 ; RV64-LMULMAX1-NEXT:    ret
 ;
@@ -573,10 +565,10 @@ define <64 x i1> @buildvec_mask_v64i1() {
 ; RV32-LMULMAX2-NEXT:    lui a0, 748384
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, 1776
 ; RV32-LMULMAX2-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-LMULMAX2-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX2-NEXT:    lui a0, 748388
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX2-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v8, a0
 ; RV32-LMULMAX2-NEXT:    ret
 ;
 ; RV64-LMULMAX2-LABEL: buildvec_mask_v64i1:
@@ -584,10 +576,10 @@ define <64 x i1> @buildvec_mask_v64i1() {
 ; RV64-LMULMAX2-NEXT:    lui a0, 748384
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, 1776
 ; RV64-LMULMAX2-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-LMULMAX2-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX2-NEXT:    lui a0, 748388
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, -1793
-; RV64-LMULMAX2-NEXT:    vmv.s.x v8, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v8, a0
 ; RV64-LMULMAX2-NEXT:    ret
 ;
 ; RV32-LMULMAX4-LABEL: buildvec_mask_v64i1:
@@ -595,11 +587,11 @@ define <64 x i1> @buildvec_mask_v64i1() {
 ; RV32-LMULMAX4-NEXT:    lui a0, 748388
 ; RV32-LMULMAX4-NEXT:    addi a0, a0, -1793
 ; RV32-LMULMAX4-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-LMULMAX4-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX4-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX4-NEXT:    lui a0, 748384
 ; RV32-LMULMAX4-NEXT:    addi a0, a0, 1776
+; RV32-LMULMAX4-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
 ; RV32-LMULMAX4-NEXT:    vmv.s.x v0, a0
-; RV32-LMULMAX4-NEXT:    vslideup.vi v0, v8, 1
 ; RV32-LMULMAX4-NEXT:    ret
 ;
 ; RV64-LMULMAX4-LABEL: buildvec_mask_v64i1:
@@ -615,11 +607,11 @@ define <64 x i1> @buildvec_mask_v64i1() {
 ; RV32-LMULMAX8-NEXT:    lui a0, 748388
 ; RV32-LMULMAX8-NEXT:    addi a0, a0, -1793
 ; RV32-LMULMAX8-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-LMULMAX8-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX8-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX8-NEXT:    lui a0, 748384
 ; RV32-LMULMAX8-NEXT:    addi a0, a0, 1776
+; RV32-LMULMAX8-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
 ; RV32-LMULMAX8-NEXT:    vmv.s.x v0, a0
-; RV32-LMULMAX8-NEXT:    vslideup.vi v0, v8, 1
 ; RV32-LMULMAX8-NEXT:    ret
 ;
 ; RV64-LMULMAX8-LABEL: buildvec_mask_v64i1:
@@ -637,19 +629,19 @@ define <128 x i1> @buildvec_mask_v128i1() {
 ; RV32-LMULMAX1:       # %bb.0:
 ; RV32-LMULMAX1-NEXT:    li a0, 1776
 ; RV32-LMULMAX1-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-LMULMAX1-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 11
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, 1718
-; RV32-LMULMAX1-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v8, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 8
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, 1718
-; RV32-LMULMAX1-NEXT:    vmv.s.x v12, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v12, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 4
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX1-NEXT:    vmv.s.x v9, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v9, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 14
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, 1722
-; RV32-LMULMAX1-NEXT:    vmv.s.x v14, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v14, a0
 ; RV32-LMULMAX1-NEXT:    vmv1r.v v10, v8
 ; RV32-LMULMAX1-NEXT:    vmv1r.v v11, v0
 ; RV32-LMULMAX1-NEXT:    vmv1r.v v13, v9
@@ -659,19 +651,19 @@ define <128 x i1> @buildvec_mask_v128i1() {
 ; RV64-LMULMAX1:       # %bb.0:
 ; RV64-LMULMAX1-NEXT:    li a0, 1776
 ; RV64-LMULMAX1-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-LMULMAX1-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 11
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, 1718
-; RV64-LMULMAX1-NEXT:    vmv.s.x v8, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v8, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 8
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, 1718
-; RV64-LMULMAX1-NEXT:    vmv.s.x v12, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v12, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 4
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, -1793
-; RV64-LMULMAX1-NEXT:    vmv.s.x v9, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v9, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 14
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, 1722
-; RV64-LMULMAX1-NEXT:    vmv.s.x v14, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v14, a0
 ; RV64-LMULMAX1-NEXT:    vmv1r.v v10, v8
 ; RV64-LMULMAX1-NEXT:    vmv1r.v v11, v0
 ; RV64-LMULMAX1-NEXT:    vmv1r.v v13, v9
@@ -682,16 +674,16 @@ define <128 x i1> @buildvec_mask_v128i1() {
 ; RV32-LMULMAX2-NEXT:    lui a0, 748384
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, 1776
 ; RV32-LMULMAX2-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-LMULMAX2-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX2-NEXT:    lui a0, 748388
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX2-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v8, a0
 ; RV32-LMULMAX2-NEXT:    lui a0, 551776
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, 1776
-; RV32-LMULMAX2-NEXT:    vmv.s.x v9, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v9, a0
 ; RV32-LMULMAX2-NEXT:    lui a0, 945060
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX2-NEXT:    vmv.s.x v10, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v10, a0
 ; RV32-LMULMAX2-NEXT:    ret
 ;
 ; RV64-LMULMAX2-LABEL: buildvec_mask_v128i1:
@@ -699,16 +691,16 @@ define <128 x i1> @buildvec_mask_v128i1() {
 ; RV64-LMULMAX2-NEXT:    lui a0, 748384
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, 1776
 ; RV64-LMULMAX2-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-LMULMAX2-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX2-NEXT:    lui a0, 748388
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, -1793
-; RV64-LMULMAX2-NEXT:    vmv.s.x v8, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v8, a0
 ; RV64-LMULMAX2-NEXT:    lui a0, 551776
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, 1776
-; RV64-LMULMAX2-NEXT:    vmv.s.x v9, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v9, a0
 ; RV64-LMULMAX2-NEXT:    lui a0, 945060
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, -1793
-; RV64-LMULMAX2-NEXT:    vmv.s.x v10, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v10, a0
 ; RV64-LMULMAX2-NEXT:    ret
 ;
 ; RV32-LMULMAX4-LABEL: buildvec_mask_v128i1:
@@ -716,18 +708,19 @@ define <128 x i1> @buildvec_mask_v128i1() {
 ; RV32-LMULMAX4-NEXT:    lui a0, 748388
 ; RV32-LMULMAX4-NEXT:    addi a0, a0, -1793
 ; RV32-LMULMAX4-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-LMULMAX4-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX4-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX4-NEXT:    lui a0, 748384
 ; RV32-LMULMAX4-NEXT:    addi a0, a0, 1776
+; RV32-LMULMAX4-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
 ; RV32-LMULMAX4-NEXT:    vmv.s.x v0, a0
-; RV32-LMULMAX4-NEXT:    vslideup.vi v0, v8, 1
 ; RV32-LMULMAX4-NEXT:    lui a0, 945060
 ; RV32-LMULMAX4-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX4-NEXT:    vmv.s.x v9, a0
+; RV32-LMULMAX4-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32-LMULMAX4-NEXT:    vmv.v.x v8, a0
 ; RV32-LMULMAX4-NEXT:    lui a0, 551776
 ; RV32-LMULMAX4-NEXT:    addi a0, a0, 1776
+; RV32-LMULMAX4-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
 ; RV32-LMULMAX4-NEXT:    vmv.s.x v8, a0
-; RV32-LMULMAX4-NEXT:    vslideup.vi v8, v9, 1
 ; RV32-LMULMAX4-NEXT:    ret
 ;
 ; RV64-LMULMAX4-LABEL: buildvec_mask_v128i1:
@@ -743,37 +736,31 @@ define <128 x i1> @buildvec_mask_v128i1() {
 ;
 ; RV32-LMULMAX8-LABEL: buildvec_mask_v128i1:
 ; RV32-LMULMAX8:       # %bb.0:
-; RV32-LMULMAX8-NEXT:    lui a0, 748388
-; RV32-LMULMAX8-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX8-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; RV32-LMULMAX8-NEXT:    vmv.s.x v8, a0
-; RV32-LMULMAX8-NEXT:    lui a0, 748384
-; RV32-LMULMAX8-NEXT:    addi a0, a0, 1776
-; RV32-LMULMAX8-NEXT:    vmv.s.x v0, a0
-; RV32-LMULMAX8-NEXT:    vslideup.vi v0, v8, 1
-; RV32-LMULMAX8-NEXT:    lui a0, 551776
-; RV32-LMULMAX8-NEXT:    addi a0, a0, 1776
-; RV32-LMULMAX8-NEXT:    vmv.s.x v8, a0
-; RV32-LMULMAX8-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
-; RV32-LMULMAX8-NEXT:    vslideup.vi v0, v8, 2
-; RV32-LMULMAX8-NEXT:    lui a0, 945060
-; RV32-LMULMAX8-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX8-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX8-NEXT:    lui a0, %hi(.LCPI20_0)
+; RV32-LMULMAX8-NEXT:    addi a0, a0, %lo(.LCPI20_0)
 ; RV32-LMULMAX8-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-LMULMAX8-NEXT:    vslideup.vi v0, v8, 3
+; RV32-LMULMAX8-NEXT:    vle32.v v0, (a0)
 ; RV32-LMULMAX8-NEXT:    ret
 ;
 ; RV64-LMULMAX8-LABEL: buildvec_mask_v128i1:
 ; RV64-LMULMAX8:       # %bb.0:
 ; RV64-LMULMAX8-NEXT:    lui a0, %hi(.LCPI20_0)
-; RV64-LMULMAX8-NEXT:    ld a0, %lo(.LCPI20_0)(a0)
-; RV64-LMULMAX8-NEXT:    lui a1, %hi(.LCPI20_1)
-; RV64-LMULMAX8-NEXT:    ld a1, %lo(.LCPI20_1)(a1)
+; RV64-LMULMAX8-NEXT:    addi a0, a0, %lo(.LCPI20_0)
 ; RV64-LMULMAX8-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-LMULMAX8-NEXT:    vmv.s.x v8, a0
-; RV64-LMULMAX8-NEXT:    vmv.s.x v0, a1
-; RV64-LMULMAX8-NEXT:    vslideup.vi v0, v8, 1
+; RV64-LMULMAX8-NEXT:    vlse64.v v0, (a0), zero
+; RV64-LMULMAX8-NEXT:    lui a0, %hi(.LCPI20_1)
+; RV64-LMULMAX8-NEXT:    ld a0, %lo(.LCPI20_1)(a0)
+; RV64-LMULMAX8-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
+; RV64-LMULMAX8-NEXT:    vmv.s.x v0, a0
 ; RV64-LMULMAX8-NEXT:    ret
+;
+; ZVE32F-LABEL: buildvec_mask_v128i1:
+; ZVE32F:       # %bb.0:
+; ZVE32F-NEXT:    lui a0, %hi(.LCPI20_0)
+; ZVE32F-NEXT:    addi a0, a0, %lo(.LCPI20_0)
+; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZVE32F-NEXT:    vle32.v v0, (a0)
+; ZVE32F-NEXT:    ret
   ret <128 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>
 }
 
@@ -782,19 +769,19 @@ define <128 x i1> @buildvec_mask_optsize_v128i1() optsize {
 ; RV32-LMULMAX1:       # %bb.0:
 ; RV32-LMULMAX1-NEXT:    li a0, 1776
 ; RV32-LMULMAX1-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-LMULMAX1-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 11
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, 1718
-; RV32-LMULMAX1-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v8, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 8
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, 1718
-; RV32-LMULMAX1-NEXT:    vmv.s.x v12, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v12, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 4
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX1-NEXT:    vmv.s.x v9, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v9, a0
 ; RV32-LMULMAX1-NEXT:    lui a0, 14
 ; RV32-LMULMAX1-NEXT:    addi a0, a0, 1722
-; RV32-LMULMAX1-NEXT:    vmv.s.x v14, a0
+; RV32-LMULMAX1-NEXT:    vmv.v.x v14, a0
 ; RV32-LMULMAX1-NEXT:    vmv1r.v v10, v8
 ; RV32-LMULMAX1-NEXT:    vmv1r.v v11, v0
 ; RV32-LMULMAX1-NEXT:    vmv1r.v v13, v9
@@ -804,19 +791,19 @@ define <128 x i1> @buildvec_mask_optsize_v128i1() optsize {
 ; RV64-LMULMAX1:       # %bb.0:
 ; RV64-LMULMAX1-NEXT:    li a0, 1776
 ; RV64-LMULMAX1-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-LMULMAX1-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 11
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, 1718
-; RV64-LMULMAX1-NEXT:    vmv.s.x v8, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v8, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 8
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, 1718
-; RV64-LMULMAX1-NEXT:    vmv.s.x v12, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v12, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 4
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, -1793
-; RV64-LMULMAX1-NEXT:    vmv.s.x v9, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v9, a0
 ; RV64-LMULMAX1-NEXT:    lui a0, 14
 ; RV64-LMULMAX1-NEXT:    addiw a0, a0, 1722
-; RV64-LMULMAX1-NEXT:    vmv.s.x v14, a0
+; RV64-LMULMAX1-NEXT:    vmv.v.x v14, a0
 ; RV64-LMULMAX1-NEXT:    vmv1r.v v10, v8
 ; RV64-LMULMAX1-NEXT:    vmv1r.v v11, v0
 ; RV64-LMULMAX1-NEXT:    vmv1r.v v13, v9
@@ -827,16 +814,16 @@ define <128 x i1> @buildvec_mask_optsize_v128i1() optsize {
 ; RV32-LMULMAX2-NEXT:    lui a0, 748384
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, 1776
 ; RV32-LMULMAX2-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-LMULMAX2-NEXT:    vmv.s.x v0, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v0, a0
 ; RV32-LMULMAX2-NEXT:    lui a0, 748388
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX2-NEXT:    vmv.s.x v8, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v8, a0
 ; RV32-LMULMAX2-NEXT:    lui a0, 551776
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, 1776
-; RV32-LMULMAX2-NEXT:    vmv.s.x v9, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v9, a0
 ; RV32-LMULMAX2-NEXT:    lui a0, 945060
 ; RV32-LMULMAX2-NEXT:    addi a0, a0, -1793
-; RV32-LMULMAX2-NEXT:    vmv.s.x v10, a0
+; RV32-LMULMAX2-NEXT:    vmv.v.x v10, a0
 ; RV32-LMULMAX2-NEXT:    ret
 ;
 ; RV64-LMULMAX2-LABEL: buildvec_mask_optsize_v128i1:
@@ -844,16 +831,16 @@ define <128 x i1> @buildvec_mask_optsize_v128i1() optsize {
 ; RV64-LMULMAX2-NEXT:    lui a0, 748384
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, 1776
 ; RV64-LMULMAX2-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-LMULMAX2-NEXT:    vmv.s.x v0, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v0, a0
 ; RV64-LMULMAX2-NEXT:    lui a0, 748388
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, -1793
-; RV64-LMULMAX2-NEXT:    vmv.s.x v8, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v8, a0
 ; RV64-LMULMAX2-NEXT:    lui a0, 551776
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, 1776
-; RV64-LMULMAX2-NEXT:    vmv.s.x v9, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v9, a0
 ; RV64-LMULMAX2-NEXT:    lui a0, 945060
 ; RV64-LMULMAX2-NEXT:    addiw a0, a0, -1793
-; RV64-LMULMAX2-NEXT:    vmv.s.x v10, a0
+; RV64-LMULMAX2-NEXT:    vmv.v.x v10, a0
 ; RV64-LMULMAX2-NEXT:    ret
 ;
 ; RV32-LMULMAX4-LABEL: buildvec_mask_optsize_v128i1:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 7fec57513ff3f..d5592104e3110 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -181,8 +181,7 @@ define void @vnsrl_32_i32(ptr %in, ptr %out) {
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
-; ZVE32F-NEXT:    vmv.s.x v0, a0
+; ZVE32F-NEXT:    vmv.v.i v0, 2
 ; ZVE32F-NEXT:    vrgather.vi v10, v8, 1
 ; ZVE32F-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; ZVE32F-NEXT:    vse32.v v10, (a1)
@@ -237,8 +236,7 @@ define void @vnsrl_32_float(ptr %in, ptr %out) {
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
-; ZVE32F-NEXT:    vmv.s.x v0, a0
+; ZVE32F-NEXT:    vmv.v.i v0, 2
 ; ZVE32F-NEXT:    vrgather.vi v10, v8, 1
 ; ZVE32F-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; ZVE32F-NEXT:    vse32.v v10, (a1)
@@ -282,8 +280,7 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) {
 ; V-NEXT:    vle64.v v8, (a0)
 ; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
-; V-NEXT:    vmv.s.x v0, a0
+; V-NEXT:    vmv.v.i v0, 2
 ; V-NEXT:    vrgather.vi v10, v8, 1
 ; V-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; V-NEXT:    vse64.v v10, (a1)
@@ -335,8 +332,7 @@ define void @vnsrl_64_double(ptr %in, ptr %out) {
 ; V-NEXT:    vle64.v v8, (a0)
 ; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
-; V-NEXT:    vmv.s.x v0, a0
+; V-NEXT:    vmv.v.i v0, 2
 ; V-NEXT:    vrgather.vi v10, v8, 1
 ; V-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; V-NEXT:    vse64.v v10, (a1)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
index df8754cdb2340..ad6aeb0f70013 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
@@ -1296,10 +1296,12 @@ define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zero
 ; RV32-NEXT:    vmv1r.v v16, v0
 ; RV32-NEXT:    li a3, 32
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    lui a3, 341
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vmv.s.x v0, a3
 ; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    lui a1, 341
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a1
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV32-NEXT:    vmerge.vxm v24, v24, a0, v0
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vmv1r.v v0, v16
@@ -1322,10 +1324,12 @@ define <11 x i64> @vand_vx_v11i64_unmasked(<11 x i64> %va, i64 %b, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a3, 32
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    lui a3, 341
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vmv.s.x v0, a3
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 341
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a1
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV32-NEXT:    vmerge.vxm v16, v16, a0, v0
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
index dfc980d0a099e..72c3d4b2e291f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
@@ -49,8 +49,7 @@ define <8 x i8> @v4i8_2(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrsub.vi v12, v11, 7
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    li a0, 15
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vrsub.vi v8, v11, 3
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -79,11 +78,11 @@ define <16 x i8> @v8i8_2(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrsub.vi v12, v11, 15
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
+; CHECK-NEXT:    vrsub.vi v8, v11, 7
 ; CHECK-NEXT:    li a0, 255
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
-; CHECK-NEXT:    vrsub.vi v8, v11, 7
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -119,7 +118,7 @@ define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) {
 ; RV32-NEXT:    lui a0, 16
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmv.v.x v0, a0
 ; RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; RV32-NEXT:    vrgather.vv v10, v14, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
@@ -139,7 +138,7 @@ define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) {
 ; RV64-NEXT:    lui a0, 16
 ; RV64-NEXT:    addiw a0, a0, -1
 ; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vmv.v.x v0, a0
 ; RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; RV64-NEXT:    vrgather.vv v10, v14, v8, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v10
@@ -195,8 +194,7 @@ define <8 x i16> @v4i16_2(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrsub.vi v12, v11, 7
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    li a0, 15
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vrsub.vi v8, v11, 3
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -222,13 +220,15 @@ define <16 x i16> @v8i16_2(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: v8i16_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v12, v9
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v14
 ; CHECK-NEXT:    vrsub.vi v16, v14, 15
 ; CHECK-NEXT:    vrgather.vv v10, v8, v16
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrsub.vi v8, v14, 7
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v12, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -265,7 +265,7 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) {
 ; RV32-NEXT:    lui a0, 16
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmv.v.x v0, a0
 ; RV32-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; RV32-NEXT:    vrgather.vv v8, v16, v12, v0.t
 ; RV32-NEXT:    ret
@@ -285,7 +285,7 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) {
 ; RV64-NEXT:    lui a0, 16
 ; RV64-NEXT:    addiw a0, a0, -1
 ; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vmv.v.x v0, a0
 ; RV64-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; RV64-NEXT:    vrgather.vv v8, v16, v12, v0.t
 ; RV64-NEXT:    ret
@@ -337,13 +337,14 @@ define <8 x i32> @v4i32_2(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: v4i32_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v12, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vid.v v14
 ; CHECK-NEXT:    vrsub.vi v16, v14, 7
 ; CHECK-NEXT:    vrgather.vv v10, v8, v16
-; CHECK-NEXT:    li a0, 15
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrsub.vi v8, v14, 3
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v12, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -368,13 +369,15 @@ define <16 x i32> @v8i32_2(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: v8i32_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v16, v10
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vid.v v20
 ; CHECK-NEXT:    vrsub.vi v24, v20, 15
 ; CHECK-NEXT:    vrgather.vv v12, v8, v24
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrsub.vi v8, v20, 7
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v12, v16, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -401,7 +404,7 @@ define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) {
 ; RV32-NEXT:    lui a0, %hi(.LCPI23_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI23_0)
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v0, (a0)
 ; RV32-NEXT:    vmv4r.v v24, v12
 ; RV32-NEXT:    vmv4r.v v16, v8
@@ -410,7 +413,9 @@ define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) {
 ; RV32-NEXT:    vrsub.vi v16, v16, 15
 ; RV32-NEXT:    lui a0, 16
 ; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a0
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -419,7 +424,7 @@ define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) {
 ; RV64-NEXT:    lui a0, %hi(.LCPI23_0)
 ; RV64-NEXT:    addi a0, a0, %lo(.LCPI23_0)
 ; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v0, (a0)
 ; RV64-NEXT:    vmv4r.v v24, v12
 ; RV64-NEXT:    vmv4r.v v16, v8
@@ -428,7 +433,9 @@ define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) {
 ; RV64-NEXT:    vrsub.vi v16, v16, 15
 ; RV64-NEXT:    lui a0, 16
 ; RV64-NEXT:    addiw a0, a0, -1
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT:    vmv.v.x v0, a0
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; RV64-NEXT:    vrgather.vv v8, v24, v16, v0.t
 ; RV64-NEXT:    ret
   %v32i32 = shufflevector <16 x i32> %a, <16 x i32> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -494,10 +501,9 @@ define <8 x i64> @v4i64_2(<4 x i64> %a, <4 x i64> %b) {
 ; RV32-NEXT:    vrsub.vi v19, v18, 7
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v12, v8, v19
-; RV32-NEXT:    li a0, 15
-; RV32-NEXT:    vmv.s.x v0, a0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32-NEXT:    vrsub.vi v8, v18, 3
+; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
@@ -506,13 +512,14 @@ define <8 x i64> @v4i64_2(<4 x i64> %a, <4 x i64> %b) {
 ; RV64-LABEL: v4i64_2:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vmv2r.v v16, v10
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vid.v v20
 ; RV64-NEXT:    vrsub.vi v24, v20, 7
 ; RV64-NEXT:    vrgather.vv v12, v8, v24
-; RV64-NEXT:    li a0, 15
-; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    vrsub.vi v8, v20, 3
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 15
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgather.vv v12, v16, v8, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v12
 ; RV64-NEXT:    ret
@@ -567,8 +574,7 @@ define <8 x half> @v4f16_2(<4 x half> %a, <4 x half> %b) {
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrsub.vi v12, v11, 7
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    li a0, 15
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vrsub.vi v8, v11, 3
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -594,13 +600,15 @@ define <16 x half> @v8f16_2(<8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: v8f16_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v12, v9
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v14
 ; CHECK-NEXT:    vrsub.vi v16, v14, 15
 ; CHECK-NEXT:    vrgather.vv v10, v8, v16
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrsub.vi v8, v14, 7
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v12, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -683,13 +691,14 @@ define <8 x float> @v4f32_2(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: v4f32_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v12, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vid.v v14
 ; CHECK-NEXT:    vrsub.vi v16, v14, 7
 ; CHECK-NEXT:    vrgather.vv v10, v8, v16
-; CHECK-NEXT:    li a0, 15
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrsub.vi v8, v14, 3
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v12, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -714,13 +723,15 @@ define <16 x float> @v8f32_2(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: v8f32_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v16, v10
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vid.v v20
 ; CHECK-NEXT:    vrsub.vi v24, v20, 15
 ; CHECK-NEXT:    vrgather.vv v12, v8, v24
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrsub.vi v8, v20, 7
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vrgather.vv v12, v16, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -787,10 +798,9 @@ define <8 x double> @v4f64_2(<4 x double> %a, <4 x double> %b) {
 ; RV32-NEXT:    vrsub.vi v19, v18, 7
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v12, v8, v19
-; RV32-NEXT:    li a0, 15
-; RV32-NEXT:    vmv.s.x v0, a0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32-NEXT:    vrsub.vi v8, v18, 3
+; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
@@ -799,13 +809,14 @@ define <8 x double> @v4f64_2(<4 x double> %a, <4 x double> %b) {
 ; RV64-LABEL: v4f64_2:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vmv2r.v v16, v10
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vid.v v20
 ; RV64-NEXT:    vrsub.vi v24, v20, 7
 ; RV64-NEXT:    vrgather.vv v12, v8, v24
-; RV64-NEXT:    li a0, 15
-; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    vrsub.vi v8, v20, 3
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 15
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgather.vv v12, v16, v8, v0.t
 ; RV64-NEXT:    vmv.v.v v8, v12
 ; RV64-NEXT:    ret

diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 67744dd4a8dc3..8d353ddcb060d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -17,12 +17,12 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) {
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
+; RV32-NEXT:    vadd.vi v12, v11, -16
 ; RV32-NEXT:    lui a0, 16
 ; RV32-NEXT:    addi a0, a0, -256
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmv.v.x v0, a0
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
-; RV32-NEXT:    vadd.vi v12, v11, -16
 ; RV32-NEXT:    vrgather.vv v9, v8, v12, v0.t
 ; RV32-NEXT:    vmsne.vi v9, v9, 0
 ; RV32-NEXT:    vadd.vi v12, v11, 1
@@ -45,12 +45,12 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) {
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64-NEXT:    vmerge.vim v8, v8, 1, v0
+; RV64-NEXT:    vadd.vi v12, v11, -16
 ; RV64-NEXT:    lui a0, 16
 ; RV64-NEXT:    addiw a0, a0, -256
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vmv.v.x v0, a0
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
-; RV64-NEXT:    vadd.vi v12, v11, -16
 ; RV64-NEXT:    vrgather.vv v9, v8, v12, v0.t
 ; RV64-NEXT:    vmsne.vi v9, v9, 0
 ; RV64-NEXT:    vadd.vi v12, v11, 1
@@ -109,8 +109,8 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    li a0, 2
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v9, v8, 1
 ; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
@@ -196,8 +196,8 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    li a0, 2
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vrgather.vi v9, v8, 1
 ; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t

diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 4374cd02699cd..ca71e83333fbf 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -662,7 +662,9 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32MV-NEXT:    vslidedown.vi v8, v8, 2
 ; RV32MV-NEXT:    li a0, 85
-; RV32MV-NEXT:    vmv.s.x v0, a0
+; RV32MV-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32MV-NEXT:    vmv.v.x v0, a0
+; RV32MV-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32MV-NEXT:    vmv.v.i v10, 1
 ; RV32MV-NEXT:    vmerge.vim v10, v10, -1, v0
 ; RV32MV-NEXT:    vand.vv v8, v8, v10
