[llvm] feff66a - [RISCV] Further optimize BUILD_VECTORs with repeated elements

Fraser Cormack via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 23 07:21:28 PDT 2021


Author: Fraser Cormack
Date: 2021-03-23T14:14:48Z
New Revision: feff66a0823144dd2c4a26e6008645786531356c

URL: https://github.com/llvm/llvm-project/commit/feff66a0823144dd2c4a26e6008645786531356c
DIFF: https://github.com/llvm/llvm-project/commit/feff66a0823144dd2c4a26e6008645786531356c.diff

LOG: [RISCV] Further optimize BUILD_VECTORs with repeated elements

This patch builds upon the initial BUILD_VECTOR work introduced in
D98700. It further optimizes the lowering of BUILD_VECTOR by using
VSELECT operations to blend repeated elements into the vector with
relatively few instructions: the most common element is splatted first,
each remaining repeated value is merged in with a single VSELECT, and
values occurring only once are inserted individually. This allows us to
optimize more BUILD_VECTORs without significantly increasing the size of
the generated code.
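
As an illustration (not part of the patch), the new heuristic can be
modelled outside of SelectionDAG. The standalone C++20 sketch below is an
approximation with made-up names (Element, the printed "plan" format are
assumptions, not LLVM APIs); it mirrors the value counting, the tie-break
towards later elements, the profitability test (dominant count above
NumDefElts - 2, or at most Log2_32(NumDefElts) distinct values), and the
per-value choice between a single INSERT_VECTOR_ELT and a VSELECT blend:

// build_vector_plan.cpp: compile with c++ -std=c++20 build_vector_plan.cpp
// Illustrative model only, not the actual SelectionDAG lowering code.
#include <bit>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <optional>
#include <set>
#include <vector>

using Element = std::optional<int64_t>; // std::nullopt models an undef operand.

int main() {
  // <4 x i64> <7, 3, 7, 3>: no single dominant value, but only two distinct ones.
  std::vector<Element> Ops = {7, 3, 7, 3};

  // Count each defined value; on a tie the later value wins, matching the
  // patch's `>=` comparison (inserts near the start of a vector are cheaper).
  std::map<int64_t, unsigned> ValueCounts;
  unsigned MostCommonCount = 0, NumUndefElts = 0;
  int64_t DominantValue = 0;
  for (const Element &V : Ops) {
    if (!V) { ++NumUndefElts; continue; }
    unsigned Count = ++ValueCounts[*V];
    if (Count >= MostCommonCount) { DominantValue = *V; MostCommonCount = Count; }
  }

  unsigned NumDefElts = (unsigned)Ops.size() - NumUndefElts;
  unsigned Threshold = NumDefElts <= 2 ? 0 : NumDefElts - 2;
  unsigned Log2NumDefElts = std::bit_width(NumDefElts) - 1; // LLVM's Log2_32

  // The lowering now fires when one value dominates *or* when there are few
  // enough distinct values that one VSELECT per value stays cheap.
  if (MostCommonCount <= Threshold && ValueCounts.size() > Log2NumDefElts) {
    std::cout << "not profitable; fall back\n";
    return 0;
  }

  std::cout << "splat " << DominantValue << "\n";
  std::set<int64_t> Processed = {DominantValue};
  for (std::size_t I = 0; I < Ops.size(); ++I) {
    const Element &V = Ops[I];
    if (!V || !Processed.insert(*V).second)
      continue;
    if (ValueCounts[*V] == 1) {
      // A unique value: a plain INSERT_VECTOR_ELT is cheapest.
      std::cout << "insert " << *V << " at index " << I << "\n";
    } else {
      // A repeated value: blend in all of its occurrences at once with a
      // VSELECT whose mask is set wherever this value appears.
      std::cout << "vselect splat(" << *V << ") with mask ";
      for (const Element &V1 : Ops)
        std::cout << (V1 == V ? '1' : '0');
      std::cout << "\n";
    }
  }
}
// Prints:
//   splat 3
//   vselect splat(7) with mask 1010
//
// which corresponds to the vmv.v.i/vmv.v.x plus vmerge.vim/vmerge.vxm
// sequences visible in the updated tests below.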

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D98969

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
    llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 83b06d53c2c2a..caeffbb40fb7a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1248,14 +1248,10 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   // "insert" the upper element, and an insert of the lower element at position
   // 0, which improves codegen.
   SDValue DominantValue;
+  unsigned MostCommonCount = 0;
   DenseMap<SDValue, unsigned> ValueCounts;
-  // Use a fairly conservative threshold. A future optimization could be to use
-  // multiple vmerge.vi/vmerge.vx instructions on "partially-dominant"
-  // elements with more relaxed thresholds.
   unsigned NumUndefElts =
       count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); });
-  unsigned NumDefElts = NumElts - NumUndefElts;
-  unsigned DominantValueCountThreshold = NumDefElts <= 2 ? 0 : NumDefElts - 2;
 
   for (SDValue V : Op->op_values()) {
     if (V.isUndef())
@@ -1264,22 +1260,48 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     ValueCounts.insert(std::make_pair(V, 0));
     unsigned &Count = ValueCounts[V];
 
-    // Is this value dominant?
-    if (++Count > DominantValueCountThreshold)
+    // Is this value dominant? In case of a tie, prefer the highest element as
+    // it's cheaper to insert near the beginning of a vector than it is at the
+    // end.
+    if (++Count >= MostCommonCount) {
       DominantValue = V;
+      MostCommonCount = Count;
+    }
   }
 
+  assert(DominantValue && "Not expecting an all-undef BUILD_VECTOR");
+  MVT XLenVT = Subtarget.getXLenVT();
+  unsigned NumDefElts = NumElts - NumUndefElts;
+  unsigned DominantValueCountThreshold = NumDefElts <= 2 ? 0 : NumDefElts - 2;
+
   // Don't perform this optimization when optimizing for size, since
   // materializing elements and inserting them tends to cause code bloat.
-  if (DominantValue && !DAG.shouldOptForSize()) {
+  if (!DAG.shouldOptForSize() &&
+      ((MostCommonCount > DominantValueCountThreshold) ||
+       (ValueCounts.size() <= Log2_32(NumDefElts)))) {
+    // Start by splatting the most common element.
     SDValue Vec = DAG.getSplatBuildVector(VT, DL, DominantValue);
 
-    if (ValueCounts.size() != 1) {
-      MVT XLenVT = Subtarget.getXLenVT();
-      for (unsigned I = 0; I < NumElts; ++I) {
-        if (!Op.getOperand(I).isUndef() && Op.getOperand(I) != DominantValue)
-          Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec,
-                            Op.getOperand(I), DAG.getConstant(I, DL, XLenVT));
+    DenseSet<SDValue> Processed{DominantValue};
+    MVT SelMaskTy = VT.changeVectorElementType(MVT::i1);
+    for (const auto &OpIdx : enumerate(Op->ops())) {
+      const SDValue &V = OpIdx.value();
+      if (V.isUndef() || !Processed.insert(V).second)
+        continue;
+      if (ValueCounts[V] == 1) {
+        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V,
+                          DAG.getConstant(OpIdx.index(), DL, XLenVT));
+      } else {
+        // Blend in all instances of this value using a VSELECT, using a
+        // mask where each bit signals whether that element is the one
+        // we're after.
+        SmallVector<SDValue> Ops;
+        transform(Op->op_values(), std::back_inserter(Ops), [&](SDValue V1) {
+          return DAG.getConstant(V == V1, DL, XLenVT);
+        });
+        Vec = DAG.getNode(ISD::VSELECT, DL, VT,
+                          DAG.getBuildVector(SelMaskTy, DL, Ops),
+                          DAG.getSplatBuildVector(VT, DL, V), Vec);
       }
     }
 

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index c3ea32110c288..8a2b439d0186b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -368,138 +368,127 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV32-LABEL: bitreverse_v2i64:
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle64.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_0)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_0)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v27, v25, v26
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_1)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_1)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v29, v25, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_2)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_2)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v30, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v29, v29, v30
-; LMULMAX2-RV32-NEXT:    vor.vv v27, v29, v27
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_3)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_3)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v29, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v30, v25, v29
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_4)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_4)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v31, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v30, v30, v31
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_5)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_5)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v31, (a1)
+; LMULMAX2-RV32-NEXT:    vle64.v v26, (a0)
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 5
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v25, 0
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 24
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v27, v25, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v8, v25, v31
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_6)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_6)
+; LMULMAX2-RV32-NEXT:    vsrl.vv v28, v26, v27
+; LMULMAX2-RV32-NEXT:    lui a1, 4080
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v9, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vor.vv v30, v8, v30
-; LMULMAX2-RV32-NEXT:    vor.vv v27, v30, v27
-; LMULMAX2-RV32-NEXT:    vsll.vv v30, v25, v31
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_7)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_7)
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v29, v25, a1, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v29
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v31, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v29, v25, 8, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vsrl.vv v30, v26, v29
+; LMULMAX2-RV32-NEXT:    lui a2, 1044480
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v31, v25, a2, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vand.vv v30, v30, v31
-; LMULMAX2-RV32-NEXT:    vsll.vv v29, v25, v29
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_8)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_8)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v31, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vor.vv v28, v30, v28
+; LMULMAX2-RV32-NEXT:    addi a2, zero, 40
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v30, v25, a2, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vsrl.vv v31, v26, v30
+; LMULMAX2-RV32-NEXT:    lui a2, 16
+; LMULMAX2-RV32-NEXT:    addi a2, a2, -256
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v8, v25, a2, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vand.vv v31, v31, v8
+; LMULMAX2-RV32-NEXT:    addi a3, zero, 56
+; LMULMAX2-RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v8, v25, a3, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vsrl.vv v9, v26, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v31, v31, v9
+; LMULMAX2-RV32-NEXT:    vor.vv v28, v28, v31
+; LMULMAX2-RV32-NEXT:    vsll.vv v29, v26, v29
+; LMULMAX2-RV32-NEXT:    addi a3, zero, 255
+; LMULMAX2-RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.x v31, a3
+; LMULMAX2-RV32-NEXT:    vmerge.vim v31, v31, 0, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vand.vv v29, v29, v31
-; LMULMAX2-RV32-NEXT:    vor.vv v29, v29, v30
-; LMULMAX2-RV32-NEXT:    vsll.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT:    vsll.vv v25, v25, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_9)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_9)
+; LMULMAX2-RV32-NEXT:    vsll.vv v27, v26, v27
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.x v31, a2
+; LMULMAX2-RV32-NEXT:    vmerge.vim v31, v31, 0, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vand.vv v27, v27, v31
+; LMULMAX2-RV32-NEXT:    vor.vv v27, v27, v29
+; LMULMAX2-RV32-NEXT:    vsll.vv v29, v26, v30
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
+; LMULMAX2-RV32-NEXT:    vmerge.vim v30, v30, 0, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v28
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v26, v25
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v29
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v27
+; LMULMAX2-RV32-NEXT:    vand.vv v29, v29, v30
+; LMULMAX2-RV32-NEXT:    vsll.vv v26, v26, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v29
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v27
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_10)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_10)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v27, (a1)
+; LMULMAX2-RV32-NEXT:    vand.vv v27, v26, v27
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v25, 4, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsll.vv v26, v26, v27
+; LMULMAX2-RV32-NEXT:    vsll.vv v27, v27, v28
 ; LMULMAX2-RV32-NEXT:    lui a1, 986895
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 240
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v29, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v28
-; LMULMAX2-RV32-NEXT:    vsrl.vv v25, v25, v27
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v29
+; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v27
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_11)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_11)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v27, (a1)
+; LMULMAX2-RV32-NEXT:    vand.vv v27, v26, v27
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v25, 2, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsll.vv v26, v26, v27
+; LMULMAX2-RV32-NEXT:    vsll.vv v27, v27, v28
 ; LMULMAX2-RV32-NEXT:    lui a1, 838861
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -820
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v29, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v28
-; LMULMAX2-RV32-NEXT:    vsrl.vv v25, v25, v27
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v29
+; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v27
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI2_12)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_12)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v27, (a1)
+; LMULMAX2-RV32-NEXT:    vand.vv v27, v26, v27
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v25, v25, 1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsll.vv v26, v26, v27
+; LMULMAX2-RV32-NEXT:    vsll.vv v27, v27, v25
 ; LMULMAX2-RV32-NEXT:    lui a1, 699051
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -1366
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v28
-; LMULMAX2-RV32-NEXT:    vsrl.vv v25, v25, v27
-; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsrl.vv v25, v26, v25
+; LMULMAX2-RV32-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX2-RV32-NEXT:    vse64.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -607,138 +596,127 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX1-RV32-LABEL: bitreverse_v2i64:
 ; LMULMAX1-RV32:       # %bb.0:
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_0)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_0)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v27, v25, v26
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_1)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_1)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v28, (a1)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v29, v25, v28
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_2)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v30, (a1)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v29, v29, v30
-; LMULMAX1-RV32-NEXT:    vor.vv v27, v29, v27
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_3)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_3)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v29, (a1)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v30, v25, v29
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_4)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_4)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v31, (a1)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v30, v30, v31
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_5)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_5)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v31, (a1)
+; LMULMAX1-RV32-NEXT:    vle64.v v26, (a0)
+; LMULMAX1-RV32-NEXT:    addi a1, zero, 5
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.i v25, 0
+; LMULMAX1-RV32-NEXT:    addi a1, zero, 24
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v27, v25, a1, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v8, v25, v31
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_6)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_6)
+; LMULMAX1-RV32-NEXT:    vsrl.vv v28, v26, v27
+; LMULMAX1-RV32-NEXT:    lui a1, 4080
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v9, (a1)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vor.vv v30, v8, v30
-; LMULMAX1-RV32-NEXT:    vor.vv v27, v30, v27
-; LMULMAX1-RV32-NEXT:    vsll.vv v30, v25, v31
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_7)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_7)
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v29, v25, a1, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vand.vv v28, v28, v29
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v31, (a1)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v29, v25, 8, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vsrl.vv v30, v26, v29
+; LMULMAX1-RV32-NEXT:    lui a2, 1044480
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v31, v25, a2, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vand.vv v30, v30, v31
-; LMULMAX1-RV32-NEXT:    vsll.vv v29, v25, v29
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_8)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_8)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v31, (a1)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vor.vv v28, v30, v28
+; LMULMAX1-RV32-NEXT:    addi a2, zero, 40
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v30, v25, a2, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vsrl.vv v31, v26, v30
+; LMULMAX1-RV32-NEXT:    lui a2, 16
+; LMULMAX1-RV32-NEXT:    addi a2, a2, -256
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v8, v25, a2, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vand.vv v31, v31, v8
+; LMULMAX1-RV32-NEXT:    addi a3, zero, 56
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v8, v25, a3, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vsrl.vv v9, v26, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v31, v31, v9
+; LMULMAX1-RV32-NEXT:    vor.vv v28, v28, v31
+; LMULMAX1-RV32-NEXT:    vsll.vv v29, v26, v29
+; LMULMAX1-RV32-NEXT:    addi a3, zero, 255
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.x v31, a3
+; LMULMAX1-RV32-NEXT:    vmerge.vim v31, v31, 0, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vand.vv v29, v29, v31
-; LMULMAX1-RV32-NEXT:    vor.vv v29, v29, v30
-; LMULMAX1-RV32-NEXT:    vsll.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT:    vsll.vv v25, v25, v28
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_9)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_9)
+; LMULMAX1-RV32-NEXT:    vsll.vv v27, v26, v27
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.x v31, a2
+; LMULMAX1-RV32-NEXT:    vmerge.vim v31, v31, 0, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vand.vv v27, v27, v31
+; LMULMAX1-RV32-NEXT:    vor.vv v27, v27, v29
+; LMULMAX1-RV32-NEXT:    vsll.vv v29, v26, v30
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v28, (a1)
+; LMULMAX1-RV32-NEXT:    vmv.v.x v30, a1
+; LMULMAX1-RV32-NEXT:    vmerge.vim v30, v30, 0, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v28
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v29
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vand.vv v29, v29, v30
+; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v29
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v28
 ; LMULMAX1-RV32-NEXT:    lui a1, 61681
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_10)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_10)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v27, (a1)
+; LMULMAX1-RV32-NEXT:    vand.vv v27, v26, v27
+; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v28, v25, 4, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vsll.vv v27, v27, v28
 ; LMULMAX1-RV32-NEXT:    lui a1, 986895
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 240
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v29, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v28
-; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v29
+; LMULMAX1-RV32-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
 ; LMULMAX1-RV32-NEXT:    lui a1, 209715
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_11)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_11)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v27, (a1)
+; LMULMAX1-RV32-NEXT:    vand.vv v27, v26, v27
+; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v28, v25, 2, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vsll.vv v27, v27, v28
 ; LMULMAX1-RV32-NEXT:    lui a1, 838861
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -820
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v29, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v28
-; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v29
+; LMULMAX1-RV32-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
 ; LMULMAX1-RV32-NEXT:    lui a1, 349525
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI2_12)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI2_12)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v27, (a1)
+; LMULMAX1-RV32-NEXT:    vand.vv v27, v26, v27
+; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v25, v25, 1, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vsll.vv v27, v27, v25
 ; LMULMAX1-RV32-NEXT:    lui a1, 699051
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -1366
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v28, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v28
-; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v28
+; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
@@ -1310,138 +1288,127 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV32-LABEL: bitreverse_v4i64:
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle64.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_0)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_0)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v8, v26, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_1)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_1)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v30, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v10, v26, v30
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_2)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_2)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v12, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT:    vor.vv v10, v10, v8
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_3)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_3)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v8, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v12, v26, v8
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_4)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_4)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v14, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v12, v12, v14
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_5)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_5)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v14, (a1)
+; LMULMAX2-RV32-NEXT:    vle64.v v28, (a0)
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 85
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v26, 0
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 24
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v30, v26, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v16, v26, v14
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_6)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_6)
+; LMULMAX2-RV32-NEXT:    vsrl.vv v8, v28, v30
+; LMULMAX2-RV32-NEXT:    lui a1, 4080
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v18, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v16, v16, v18
-; LMULMAX2-RV32-NEXT:    vor.vv v12, v16, v12
-; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT:    vsll.vv v12, v26, v14
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_7)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_7)
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v10, v26, a1, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e64,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v14, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v10, v26, 8, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e64,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vsrl.vv v12, v28, v10
+; LMULMAX2-RV32-NEXT:    lui a2, 1044480
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v14, v26, a2, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vand.vv v12, v12, v14
-; LMULMAX2-RV32-NEXT:    vsll.vv v8, v26, v8
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_8)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_8)
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v8
+; LMULMAX2-RV32-NEXT:    addi a2, zero, 40
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v8, v26, a2, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e64,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vsrl.vv v14, v28, v8
+; LMULMAX2-RV32-NEXT:    lui a2, 16
+; LMULMAX2-RV32-NEXT:    addi a2, a2, -256
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v16, v26, a2, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 4, e64,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vand.vv v14, v14, v16
+; LMULMAX2-RV32-NEXT:    addi a3, zero, 56
+; LMULMAX2-RV32-NEXT:    vsetivli a4, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v16, v26, a3, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 4, e64,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vsrl.vv v18, v28, v16
+; LMULMAX2-RV32-NEXT:    vor.vv v14, v14, v18
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v14
+; LMULMAX2-RV32-NEXT:    vsll.vv v10, v28, v10
+; LMULMAX2-RV32-NEXT:    addi a3, zero, 255
+; LMULMAX2-RV32-NEXT:    vsetivli a4, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.x v14, a3
+; LMULMAX2-RV32-NEXT:    vmerge.vim v14, v14, 0, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 4, e64,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vand.vv v10, v10, v14
+; LMULMAX2-RV32-NEXT:    vsll.vv v30, v28, v30
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.x v14, a2
+; LMULMAX2-RV32-NEXT:    vmerge.vim v14, v14, 0, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e64,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vand.vv v30, v30, v14
+; LMULMAX2-RV32-NEXT:    vor.vv v30, v30, v10
+; LMULMAX2-RV32-NEXT:    vsll.vv v8, v28, v8
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v14, (a1)
+; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32-NEXT:    vmerge.vim v10, v10, 0, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v14
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v12
-; LMULMAX2-RV32-NEXT:    vsll.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    vsll.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_9)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_9)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v30, (a1)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v28, v26
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v8
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v10
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT:    vsll.vv v28, v28, v16
+; LMULMAX2-RV32-NEXT:    vor.vv v28, v28, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v28, v28, v30
+; LMULMAX2-RV32-NEXT:    vor.vv v28, v28, v12
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_10)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_10)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v30, (a1)
+; LMULMAX2-RV32-NEXT:    vand.vv v30, v28, v30
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v8, v26, 4, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsll.vv v28, v28, v30
+; LMULMAX2-RV32-NEXT:    vsll.vv v30, v30, v8
 ; LMULMAX2-RV32-NEXT:    lui a1, 986895
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 240
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v10
+; LMULMAX2-RV32-NEXT:    vsrl.vv v28, v28, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v28, v28, v30
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_11)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_11)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v30, (a1)
+; LMULMAX2-RV32-NEXT:    vand.vv v30, v28, v30
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v8, v26, 2, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsll.vv v28, v28, v30
+; LMULMAX2-RV32-NEXT:    vsll.vv v30, v30, v8
 ; LMULMAX2-RV32-NEXT:    lui a1, 838861
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -820
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v10
+; LMULMAX2-RV32-NEXT:    vsrl.vv v28, v28, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v28, v28, v30
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI5_12)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI5_12)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v30, (a1)
+; LMULMAX2-RV32-NEXT:    vand.vv v30, v28, v30
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v26, v26, 1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsll.vv v28, v28, v30
+; LMULMAX2-RV32-NEXT:    vsll.vv v30, v30, v26
 ; LMULMAX2-RV32-NEXT:    lui a1, 699051
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -1366
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vmv.v.x v8, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v8
+; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v28, v26
+; LMULMAX2-RV32-NEXT:    vor.vv v26, v26, v30
 ; LMULMAX2-RV32-NEXT:    vse64.v v26, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -1551,176 +1518,165 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v11, (a1)
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_0)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_0)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v26, (a2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v28, v11, v26
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_1)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_1)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v27, (a2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v30, v11, v27
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_2)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_2)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v29, (a2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v30, v30, v29
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v30, v28
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_3)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_3)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v28, (a2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v31, v11, v28
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_4)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_4)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v30, (a2)
+; LMULMAX1-RV32-NEXT:    vle64.v v13, (a1)
+; LMULMAX1-RV32-NEXT:    addi a2, zero, 5
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 1, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.i v30, 0
+; LMULMAX1-RV32-NEXT:    addi a2, zero, 24
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v26, v30, a2, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v31, v30
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_5)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_5)
+; LMULMAX1-RV32-NEXT:    vsrl.vv v27, v13, v26
+; LMULMAX1-RV32-NEXT:    lui a2, 4080
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v31, (a2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v12, v11, v31
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_6)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_6)
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v28, v30, a2, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vand.vv v29, v27, v28
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v8, (a2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v8
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v10, v9
-; LMULMAX1-RV32-NEXT:    vsll.vv v10, v11, v31
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_7)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_7)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v9, (a2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v10, v9
-; LMULMAX1-RV32-NEXT:    vsll.vv v14, v11, v28
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_8)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_8)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v10, (a2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v14, v14, v10
-; LMULMAX1-RV32-NEXT:    vor.vv v13, v14, v13
-; LMULMAX1-RV32-NEXT:    vsll.vv v14, v11, v26
-; LMULMAX1-RV32-NEXT:    vsll.vv v15, v11, v27
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_9)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_9)
+; LMULMAX1-RV32-NEXT:    vmerge.vim v27, v30, 8, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vsrl.vv v8, v13, v27
+; LMULMAX1-RV32-NEXT:    lui a3, 1044480
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v31, v30, a3, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v31
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v8, v29
+; LMULMAX1-RV32-NEXT:    addi a3, zero, 40
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v29, v30, a3, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vsrl.vv v8, v13, v29
+; LMULMAX1-RV32-NEXT:    lui a3, 16
+; LMULMAX1-RV32-NEXT:    addi a3, a3, -256
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v9, v30, a3, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v8, v9
+; LMULMAX1-RV32-NEXT:    addi a4, zero, 56
+; LMULMAX1-RV32-NEXT:    vsetivli a5, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v8, v30, a4, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vsrl.vv v12, v13, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v14, v10, v11
+; LMULMAX1-RV32-NEXT:    vsll.vv v11, v13, v27
+; LMULMAX1-RV32-NEXT:    addi a4, zero, 255
+; LMULMAX1-RV32-NEXT:    vsetivli a5, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a4
+; LMULMAX1-RV32-NEXT:    vmerge.vim v10, v10, 0, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vand.vv v12, v11, v10
+; LMULMAX1-RV32-NEXT:    vsll.vv v15, v13, v26
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.x v11, a3
+; LMULMAX1-RV32-NEXT:    vmerge.vim v11, v11, 0, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vand.vv v15, v15, v11
+; LMULMAX1-RV32-NEXT:    vor.vv v15, v15, v12
+; LMULMAX1-RV32-NEXT:    vsll.vv v16, v13, v29
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v11, (a2)
+; LMULMAX1-RV32-NEXT:    vmv.v.x v12, a2
+; LMULMAX1-RV32-NEXT:    vmerge.vim v12, v12, 0, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v15, v15, v11
-; LMULMAX1-RV32-NEXT:    vor.vv v14, v14, v15
-; LMULMAX1-RV32-NEXT:    vor.vv v13, v14, v13
-; LMULMAX1-RV32-NEXT:    vor.vv v15, v13, v12
+; LMULMAX1-RV32-NEXT:    vand.vv v16, v16, v12
+; LMULMAX1-RV32-NEXT:    vsll.vv v13, v13, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v13, v13, v16
+; LMULMAX1-RV32-NEXT:    vor.vv v13, v13, v15
+; LMULMAX1-RV32-NEXT:    vor.vv v16, v13, v14
 ; LMULMAX1-RV32-NEXT:    lui a2, 61681
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -241
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v12, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.x v13, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v14, v15, v12
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_10)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_10)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v13, (a2)
+; LMULMAX1-RV32-NEXT:    vand.vv v15, v16, v13
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v14, v30, 4, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsll.vv v16, v14, v13
+; LMULMAX1-RV32-NEXT:    vsll.vv v17, v15, v14
 ; LMULMAX1-RV32-NEXT:    lui a2, 986895
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 240
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v14, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.x v15, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v15, v15, v14
-; LMULMAX1-RV32-NEXT:    vsrl.vv v15, v15, v13
-; LMULMAX1-RV32-NEXT:    vor.vv v17, v15, v16
+; LMULMAX1-RV32-NEXT:    vand.vv v16, v16, v15
+; LMULMAX1-RV32-NEXT:    vsrl.vv v16, v16, v14
+; LMULMAX1-RV32-NEXT:    vor.vv v18, v16, v17
 ; LMULMAX1-RV32-NEXT:    lui a2, 209715
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 819
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v15, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.x v16, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v18, v17, v15
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_11)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_11)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v16, (a2)
+; LMULMAX1-RV32-NEXT:    vand.vv v19, v18, v16
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v17, v30, 2, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsll.vv v18, v18, v16
+; LMULMAX1-RV32-NEXT:    vsll.vv v19, v19, v17
 ; LMULMAX1-RV32-NEXT:    lui a2, 838861
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -820
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v19, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.x v20, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v17, v17, v19
-; LMULMAX1-RV32-NEXT:    vsrl.vv v17, v17, v16
-; LMULMAX1-RV32-NEXT:    vor.vv v17, v17, v18
+; LMULMAX1-RV32-NEXT:    vand.vv v18, v18, v20
+; LMULMAX1-RV32-NEXT:    vsrl.vv v18, v18, v17
+; LMULMAX1-RV32-NEXT:    vor.vv v18, v18, v19
 ; LMULMAX1-RV32-NEXT:    lui a2, 349525
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 1365
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v18, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.x v19, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v20, v17, v18
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI5_12)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI5_12)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v21, (a2)
+; LMULMAX1-RV32-NEXT:    vand.vv v21, v18, v19
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v30, v30, 1, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsll.vv v20, v20, v21
+; LMULMAX1-RV32-NEXT:    vsll.vv v21, v21, v30
 ; LMULMAX1-RV32-NEXT:    lui a2, 699051
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -1366
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vmv.v.x v22, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v17, v17, v22
-; LMULMAX1-RV32-NEXT:    vsrl.vv v17, v17, v21
-; LMULMAX1-RV32-NEXT:    vor.vv v17, v17, v20
-; LMULMAX1-RV32-NEXT:    vsrl.vv v20, v25, v26
-; LMULMAX1-RV32-NEXT:    vsrl.vv v23, v25, v27
-; LMULMAX1-RV32-NEXT:    vand.vv v29, v23, v29
-; LMULMAX1-RV32-NEXT:    vor.vv v29, v29, v20
-; LMULMAX1-RV32-NEXT:    vsrl.vv v20, v25, v28
-; LMULMAX1-RV32-NEXT:    vand.vv v30, v20, v30
-; LMULMAX1-RV32-NEXT:    vsrl.vv v20, v25, v31
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v20, v8
-; LMULMAX1-RV32-NEXT:    vor.vv v30, v8, v30
-; LMULMAX1-RV32-NEXT:    vor.vv v29, v30, v29
-; LMULMAX1-RV32-NEXT:    vsll.vv v30, v25, v31
-; LMULMAX1-RV32-NEXT:    vand.vv v30, v30, v9
-; LMULMAX1-RV32-NEXT:    vsll.vv v28, v25, v28
-; LMULMAX1-RV32-NEXT:    vand.vv v28, v28, v10
-; LMULMAX1-RV32-NEXT:    vor.vv v28, v28, v30
+; LMULMAX1-RV32-NEXT:    vand.vv v18, v18, v22
+; LMULMAX1-RV32-NEXT:    vsrl.vv v18, v18, v30
+; LMULMAX1-RV32-NEXT:    vor.vv v18, v18, v21
+; LMULMAX1-RV32-NEXT:    vsrl.vv v21, v25, v26
+; LMULMAX1-RV32-NEXT:    vand.vv v28, v21, v28
+; LMULMAX1-RV32-NEXT:    vsrl.vv v21, v25, v27
+; LMULMAX1-RV32-NEXT:    vand.vv v31, v21, v31
+; LMULMAX1-RV32-NEXT:    vor.vv v28, v31, v28
+; LMULMAX1-RV32-NEXT:    vsrl.vv v31, v25, v29
+; LMULMAX1-RV32-NEXT:    vand.vv v31, v31, v9
+; LMULMAX1-RV32-NEXT:    vsrl.vv v9, v25, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v31, v31, v9
+; LMULMAX1-RV32-NEXT:    vor.vv v28, v28, v31
+; LMULMAX1-RV32-NEXT:    vsll.vv v27, v25, v27
+; LMULMAX1-RV32-NEXT:    vand.vv v27, v27, v10
 ; LMULMAX1-RV32-NEXT:    vsll.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT:    vsll.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v11
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v26, v25
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v11
+; LMULMAX1-RV32-NEXT:    vor.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vsll.vv v27, v25, v29
+; LMULMAX1-RV32-NEXT:    vand.vv v27, v27, v12
+; LMULMAX1-RV32-NEXT:    vsll.vv v25, v25, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v28
-; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v29
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v12
-; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v13
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v14
-; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v13
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v13
+; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v14
+; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v15
+; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v14
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v15
-; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v16
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v19
-; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v16
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v16
+; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v17
+; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v20
+; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v17
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v18
-; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v21
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v25, v19
+; LMULMAX1-RV32-NEXT:    vsll.vv v26, v26, v30
 ; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v22
-; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v21
+; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v30
 ; LMULMAX1-RV32-NEXT:    vor.vv v25, v25, v26
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v17, (a1)
+; LMULMAX1-RV32-NEXT:    vse64.v v18, (a1)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bitreverse_v4i64:

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index a007f503a5ac7..09b81e0d2fa7d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -264,56 +264,53 @@ define void @ctpop_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle64.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI3_0)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI3_0)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 5
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v26, 0
+; LMULMAX2-RV32-NEXT:    vmerge.vim v27, v26, 1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v25, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vv v27, v25, v27
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v27, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v27
-; LMULMAX2-RV32-NEXT:    vsub.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vand.vv v27, v27, v28
+; LMULMAX2-RV32-NEXT:    vsub.vv v25, v25, v27
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v27, v26, 2, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vsrl.vv v27, v25, v27
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a1
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v27, v25, v26
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI3_1)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI3_1)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v25, v25, v28
-; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v26
-; LMULMAX2-RV32-NEXT:    vadd.vv v25, v27, v25
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI3_2)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI3_2)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
+; LMULMAX2-RV32-NEXT:    vand.vv v27, v27, v28
+; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v28
+; LMULMAX2-RV32-NEXT:    vadd.vv v25, v25, v27
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v27, v26, 4, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v25, v26
-; LMULMAX2-RV32-NEXT:    vadd.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vsrl.vv v27, v25, v27
+; LMULMAX2-RV32-NEXT:    vadd.vv v25, v25, v27
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v26
+; LMULMAX2-RV32-NEXT:    vand.vv v25, v25, v27
 ; LMULMAX2-RV32-NEXT:    lui a1, 4112
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vmul.vv v25, v25, v26
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI3_3)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI3_3)
+; LMULMAX2-RV32-NEXT:    vmul.vv v25, v25, v27
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 56
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vsrl.vv v25, v25, v26
 ; LMULMAX2-RV32-NEXT:    vse64.v v25, (a0)
@@ -373,56 +370,53 @@ define void @ctpop_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX1-RV32:       # %bb.0:
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI3_0)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI3_0)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, zero, 5
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.i v26, 0
+; LMULMAX1-RV32-NEXT:    vmerge.vim v27, v26, 1, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v26, v25, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vv v27, v25, v27
 ; LMULMAX1-RV32-NEXT:    lui a1, 349525
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v28, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT:    vsub.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vand.vv v27, v27, v28
+; LMULMAX1-RV32-NEXT:    vsub.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v27, v26, 2, v0
+; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vsrl.vv v27, v25, v27
 ; LMULMAX1-RV32-NEXT:    lui a1, 209715
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v28, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v27, v25, v26
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI3_1)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI3_1)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v28, (a1)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v28
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT:    vadd.vv v25, v27, v25
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI3_2)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI3_2)
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
+; LMULMAX1-RV32-NEXT:    vand.vv v27, v27, v28
+; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v28
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v27, v26, 4, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v26, v25, v26
-; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vsrl.vv v27, v25, v27
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v27
 ; LMULMAX1-RV32-NEXT:    lui a1, 61681
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v26
+; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v27
 ; LMULMAX1-RV32-NEXT:    lui a1, 4112
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 257
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v26, a1
+; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a1
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmul.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT:    lui a1, %hi(.LCPI3_3)
-; LMULMAX1-RV32-NEXT:    addi a1, a1, %lo(.LCPI3_3)
+; LMULMAX1-RV32-NEXT:    vmul.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    addi a1, zero, 56
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v26
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
@@ -837,59 +831,56 @@ define void @ctpop_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV32-LABEL: ctpop_v4i64:
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle64.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI7_0)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI7_0)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-RV32-NEXT:    vle64.v v28, (a0)
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 85
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v26, 0
+; LMULMAX2-RV32-NEXT:    vmerge.vim v30, v26, 1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v28, v26, v28
+; LMULMAX2-RV32-NEXT:    vsrl.vv v30, v28, v30
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v8, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v30
-; LMULMAX2-RV32-NEXT:    vsub.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vand.vv v30, v30, v8
+; LMULMAX2-RV32-NEXT:    vsub.vv v28, v28, v30
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v30, v26, 2, v0
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vsrl.vv v30, v28, v30
 ; LMULMAX2-RV32-NEXT:    lui a1, 209715
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v30, v26, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI7_1)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI7_1)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v8, (a1)
+; LMULMAX2-RV32-NEXT:    vmv.v.x v8, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v8
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v28
-; LMULMAX2-RV32-NEXT:    vadd.vv v26, v30, v26
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI7_2)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI7_2)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-RV32-NEXT:    vand.vv v30, v30, v8
+; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v8
+; LMULMAX2-RV32-NEXT:    vadd.vv v28, v28, v30
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v30, v26, 4, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsrl.vv v30, v28, v30
+; LMULMAX2-RV32-NEXT:    vadd.vv v28, v28, v30
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vand.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vand.vv v28, v28, v30
 ; LMULMAX2-RV32-NEXT:    lui a1, 4112
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vmul.vv v26, v26, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI7_3)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI7_3)
+; LMULMAX2-RV32-NEXT:    vmul.vv v28, v28, v30
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 56
 ; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v28, v26
 ; LMULMAX2-RV32-NEXT:    vse64.v v26, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -949,70 +940,67 @@ define void @ctpop_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle64.v v26, (a1)
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI7_0)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI7_0)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v27, (a2)
+; LMULMAX1-RV32-NEXT:    addi a2, zero, 5
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 1, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.i v27, 0
+; LMULMAX1-RV32-NEXT:    vmerge.vim v28, v27, 1, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v28, v26, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vv v29, v26, v28
 ; LMULMAX1-RV32-NEXT:    lui a2, 349525
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 1365
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v29, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.x v30, a2
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vand.vv v29, v29, v30
+; LMULMAX1-RV32-NEXT:    vsub.vv v26, v26, v29
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v29, v27, 2, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v28, v28, v29
-; LMULMAX1-RV32-NEXT:    vsub.vv v26, v26, v28
+; LMULMAX1-RV32-NEXT:    vsrl.vv v31, v26, v29
 ; LMULMAX1-RV32-NEXT:    lui a2, 209715
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 819
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v28, a2
-; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v30, v26, v28
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI7_1)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI7_1)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v31, (a2)
+; LMULMAX1-RV32-NEXT:    vmv.v.x v8, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v26, v26, v31
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v28
-; LMULMAX1-RV32-NEXT:    vadd.vv v26, v30, v26
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI7_2)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI7_2)
-; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v30, (a2)
+; LMULMAX1-RV32-NEXT:    vand.vv v31, v31, v8
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v8
+; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v31
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmerge.vim v31, v27, 4, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v8, v26, v30
-; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v8
+; LMULMAX1-RV32-NEXT:    vsrl.vv v9, v26, v31
+; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v9
 ; LMULMAX1-RV32-NEXT:    lui a2, 61681
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -241
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v8, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v8
+; LMULMAX1-RV32-NEXT:    vand.vv v26, v26, v9
 ; LMULMAX1-RV32-NEXT:    lui a2, 4112
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 257
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a2
+; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a2
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vmul.vv v26, v26, v9
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI7_3)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI7_3)
+; LMULMAX1-RV32-NEXT:    vmul.vv v26, v26, v10
+; LMULMAX1-RV32-NEXT:    addi a2, zero, 56
 ; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v10, (a2)
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v27, v27, a2, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vsrl.vv v26, v26, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vv v27, v25, v27
-; LMULMAX1-RV32-NEXT:    vand.vv v27, v27, v29
-; LMULMAX1-RV32-NEXT:    vsub.vv v25, v25, v27
-; LMULMAX1-RV32-NEXT:    vand.vv v27, v25, v28
-; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v31
-; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v28
-; LMULMAX1-RV32-NEXT:    vadd.vv v25, v27, v25
-; LMULMAX1-RV32-NEXT:    vsrl.vv v27, v25, v30
-; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vv v28, v25, v28
+; LMULMAX1-RV32-NEXT:    vand.vv v28, v28, v30
+; LMULMAX1-RV32-NEXT:    vsub.vv v25, v25, v28
+; LMULMAX1-RV32-NEXT:    vsrl.vv v28, v25, v29
+; LMULMAX1-RV32-NEXT:    vand.vv v28, v28, v8
 ; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v8
-; LMULMAX1-RV32-NEXT:    vmul.vv v25, v25, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v10
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v28
+; LMULMAX1-RV32-NEXT:    vsrl.vv v28, v25, v31
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v28
+; LMULMAX1-RV32-NEXT:    vand.vv v25, v25, v9
+; LMULMAX1-RV32-NEXT:    vmul.vv v25, v25, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vv v25, v25, v27
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse64.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    ret

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index a48323916e1a3..ea9eba3ea2df8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -78,3 +78,24 @@ define void @buildvec_dominant2_v4f32(<4 x float>* %x, float %f) {
   store <4 x float> %v3, <4 x float>* %x
   ret void
 }
+
+define void @buildvec_merge0_v4f32(<4 x float>* %x, float %f) {
+; CHECK-LABEL: buildvec_merge0_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, zero, 6
+; CHECK-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; CHECK-NEXT:    lui a2, %hi(.LCPI4_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI4_0)(a2)
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT:    vfmv.v.f v25, fa0
+; CHECK-NEXT:    vfmerge.vfm v25, v25, ft0, v0
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %v0 = insertelement <4 x float> undef, float %f, i32 0
+  %v1 = insertelement <4 x float> %v0, float 2.0, i32 1
+  %v2 = insertelement <4 x float> %v1, float 2.0, i32 2
+  %v3 = insertelement <4 x float> %v2, float %f, i32 3
+  store <4 x float> %v3, <4 x float>* %x
+  ret void
+}

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll
index 619ba8fdab13c..6f1f72a4f07a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll
@@ -48,44 +48,35 @@ define void @splat_v4i32(<4 x i32>* %x, i32 %y) {
 define void @splat_v2i64(<2 x i64>* %x, i64 %y) {
 ; LMULMAX8-RV32-LABEL: splat_v2i64:
 ; LMULMAX8-RV32:       # %bb.0:
-; LMULMAX8-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX8-RV32-NEXT:    .cfi_def_cfa_offset 16
-; LMULMAX8-RV32-NEXT:    sw a2, 12(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 8(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 4(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 0(sp)
-; LMULMAX8-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; LMULMAX8-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX8-RV32-NEXT:    addi a3, zero, 5
+; LMULMAX8-RV32-NEXT:    vsetivli a4, 1, e8,m1,ta,mu
+; LMULMAX8-RV32-NEXT:    vmv.s.x v0, a3
+; LMULMAX8-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX8-RV32-NEXT:    vmv.v.x v25, a2
+; LMULMAX8-RV32-NEXT:    vmerge.vxm v25, v25, a1, v0
 ; LMULMAX8-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX8-RV32-NEXT:    addi sp, sp, 16
 ; LMULMAX8-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV32-LABEL: splat_v2i64:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
-; LMULMAX2-RV32-NEXT:    sw a2, 12(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
-; LMULMAX2-RV32-NEXT:    sw a2, 4(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 0(sp)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a3, zero, 5
+; LMULMAX2-RV32-NEXT:    vsetivli a4, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a3
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.x v25, a2
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v25, v25, a1, v0
 ; LMULMAX2-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: splat_v2i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
-; LMULMAX1-RV32-NEXT:    sw a2, 12(sp)
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
-; LMULMAX1-RV32-NEXT:    sw a2, 4(sp)
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a3, zero, 5
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 1, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a3
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.x v25, a2
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v25, v25, a1, v0
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX8-RV64-LABEL: splat_v2i64:
@@ -206,74 +197,37 @@ define void @splat_v8i32(<8 x i32>* %x, i32 %y) {
 define void @splat_v4i64(<4 x i64>* %x, i64 %y) {
 ; LMULMAX8-RV32-LABEL: splat_v4i64:
 ; LMULMAX8-RV32:       # %bb.0:
-; LMULMAX8-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX8-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX8-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX8-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; LMULMAX8-RV32-NEXT:    .cfi_offset ra, -4
-; LMULMAX8-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX8-RV32-NEXT:    addi s0, sp, 64
-; LMULMAX8-RV32-NEXT:    .cfi_def_cfa s0, 0
-; LMULMAX8-RV32-NEXT:    andi sp, sp, -32
-; LMULMAX8-RV32-NEXT:    sw a2, 28(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 24(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 20(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 16(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 12(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 8(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 4(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 0(sp)
-; LMULMAX8-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
-; LMULMAX8-RV32-NEXT:    vle32.v v26, (sp)
+; LMULMAX8-RV32-NEXT:    addi a3, zero, 85
+; LMULMAX8-RV32-NEXT:    vsetivli a4, 1, e8,m1,ta,mu
+; LMULMAX8-RV32-NEXT:    vmv.s.x v0, a3
+; LMULMAX8-RV32-NEXT:    vsetivli a3, 8, e32,m2,ta,mu
+; LMULMAX8-RV32-NEXT:    vmv.v.x v26, a2
+; LMULMAX8-RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; LMULMAX8-RV32-NEXT:    vse32.v v26, (a0)
-; LMULMAX8-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX8-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX8-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX8-RV32-NEXT:    addi sp, sp, 64
 ; LMULMAX8-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV32-LABEL: splat_v4i64:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
-; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
-; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
-; LMULMAX2-RV32-NEXT:    sw a2, 28(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
-; LMULMAX2-RV32-NEXT:    sw a2, 20(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
-; LMULMAX2-RV32-NEXT:    sw a2, 12(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
-; LMULMAX2-RV32-NEXT:    sw a2, 4(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 0(sp)
-; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a3, zero, 85
+; LMULMAX2-RV32-NEXT:    vsetivli a4, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a3
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.x v26, a2
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; LMULMAX2-RV32-NEXT:    vse32.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: splat_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
-; LMULMAX1-RV32-NEXT:    sw a2, 12(sp)
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
-; LMULMAX1-RV32-NEXT:    sw a2, 4(sp)
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
-; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a3, zero, 5
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 1, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a3
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.x v25, a2
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v25, v25, a1, v0
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX8-RV64-LABEL: splat_v4i64:
@@ -842,72 +796,23 @@ define void @splat_allones_with_use_v4i64(<4 x i64>* %x) {
 define void @vadd_vx_v16i64(<16 x i64>* %a, i64 %b, <16 x i64>* %c) {
 ; LMULMAX8-RV32-LABEL: vadd_vx_v16i64:
 ; LMULMAX8-RV32:       # %bb.0:
-; LMULMAX8-RV32-NEXT:    addi sp, sp, -256
-; LMULMAX8-RV32-NEXT:    .cfi_def_cfa_offset 256
-; LMULMAX8-RV32-NEXT:    sw ra, 252(sp) # 4-byte Folded Spill
-; LMULMAX8-RV32-NEXT:    sw s0, 248(sp) # 4-byte Folded Spill
-; LMULMAX8-RV32-NEXT:    .cfi_offset ra, -4
-; LMULMAX8-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX8-RV32-NEXT:    addi s0, sp, 256
-; LMULMAX8-RV32-NEXT:    .cfi_def_cfa s0, 0
-; LMULMAX8-RV32-NEXT:    andi sp, sp, -128
 ; LMULMAX8-RV32-NEXT:    vsetivli a4, 16, e64,m8,ta,mu
 ; LMULMAX8-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    sw a2, 124(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 120(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 116(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 112(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 108(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 104(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 100(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 96(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 92(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 88(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 84(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 80(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 76(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 72(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 68(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 64(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 60(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 56(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 52(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 48(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 44(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 40(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 36(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 32(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 28(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 24(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 20(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 16(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 12(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 8(sp)
-; LMULMAX8-RV32-NEXT:    sw a2, 4(sp)
-; LMULMAX8-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX8-RV32-NEXT:    lui a0, 349525
+; LMULMAX8-RV32-NEXT:    addi a0, a0, 1365
+; LMULMAX8-RV32-NEXT:    vsetivli a4, 1, e32,m1,ta,mu
+; LMULMAX8-RV32-NEXT:    vmv.s.x v0, a0
 ; LMULMAX8-RV32-NEXT:    addi a0, zero, 32
 ; LMULMAX8-RV32-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
-; LMULMAX8-RV32-NEXT:    vle32.v v16, (sp)
+; LMULMAX8-RV32-NEXT:    vmv.v.x v16, a2
+; LMULMAX8-RV32-NEXT:    vmerge.vxm v16, v16, a1, v0
 ; LMULMAX8-RV32-NEXT:    vsetivli a0, 16, e64,m8,ta,mu
 ; LMULMAX8-RV32-NEXT:    vadd.vv v8, v8, v16
 ; LMULMAX8-RV32-NEXT:    vse64.v v8, (a3)
-; LMULMAX8-RV32-NEXT:    addi sp, s0, -256
-; LMULMAX8-RV32-NEXT:    lw s0, 248(sp) # 4-byte Folded Reload
-; LMULMAX8-RV32-NEXT:    lw ra, 252(sp) # 4-byte Folded Reload
-; LMULMAX8-RV32-NEXT:    addi sp, sp, 256
 ; LMULMAX8-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV32-LABEL: vadd_vx_v16i64:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
-; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
-; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    addi a4, a0, 64
 ; LMULMAX2-RV32-NEXT:    vsetivli a5, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle64.v v26, (a4)
@@ -916,16 +821,12 @@ define void @vadd_vx_v16i64(<16 x i64>* %a, i64 %b, <16 x i64>* %c) {
 ; LMULMAX2-RV32-NEXT:    vle64.v v30, (a0)
 ; LMULMAX2-RV32-NEXT:    addi a0, a0, 32
 ; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    sw a2, 28(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
-; LMULMAX2-RV32-NEXT:    sw a2, 20(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
-; LMULMAX2-RV32-NEXT:    sw a2, 12(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
-; LMULMAX2-RV32-NEXT:    sw a2, 4(sp)
-; LMULMAX2-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    addi a0, zero, 85
+; LMULMAX2-RV32-NEXT:    vsetivli a4, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a0
 ; LMULMAX2-RV32-NEXT:    vsetivli a0, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v10, (sp)
+; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a2
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    vadd.vv v30, v30, v10
@@ -938,16 +839,10 @@ define void @vadd_vx_v16i64(<16 x i64>* %a, i64 %b, <16 x i64>* %c) {
 ; LMULMAX2-RV32-NEXT:    vse64.v v30, (a3)
 ; LMULMAX2-RV32-NEXT:    addi a0, a3, 32
 ; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: vadd_vx_v16i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; LMULMAX1-RV32-NEXT:    addi a4, a0, 96
 ; LMULMAX1-RV32-NEXT:    vsetivli a5, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a4)
@@ -964,12 +859,12 @@ define void @vadd_vx_v16i64(<16 x i64>* %a, i64 %b, <16 x i64>* %c) {
 ; LMULMAX1-RV32-NEXT:    vle64.v v31, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a0, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    sw a2, 12(sp)
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
-; LMULMAX1-RV32-NEXT:    sw a2, 4(sp)
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    addi a0, zero, 5
+; LMULMAX1-RV32-NEXT:    vsetivli a4, 1, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a0
 ; LMULMAX1-RV32-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v9, (sp)
+; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a2
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v9, v9, a1, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v9
 ; LMULMAX1-RV32-NEXT:    vadd.vv v31, v31, v9
@@ -994,7 +889,6 @@ define void @vadd_vx_v16i64(<16 x i64>* %a, i64 %b, <16 x i64>* %c) {
 ; LMULMAX1-RV32-NEXT:    vse64.v v31, (a3)
 ; LMULMAX1-RV32-NEXT:    addi a0, a3, 16
 ; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX8-RV64-LABEL: vadd_vx_v16i64:

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 84784ee82c1ce..8c82c1238eace 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -837,30 +837,99 @@ define void @urem_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 }
 
 define void @mulhu_v16i8(<16 x i8>* %x) {
-; CHECK-LABEL: mulhu_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
-; CHECK-NEXT:    vle8.v v25, (a0)
-; CHECK-NEXT:    lui a1, %hi(.LCPI52_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI52_0)
-; CHECK-NEXT:    vle8.v v26, (a1)
-; CHECK-NEXT:    lui a1, %hi(.LCPI52_1)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI52_1)
-; CHECK-NEXT:    vle8.v v27, (a1)
-; CHECK-NEXT:    vsrl.vv v26, v25, v26
-; CHECK-NEXT:    vmulhu.vv v26, v26, v27
-; CHECK-NEXT:    lui a1, %hi(.LCPI52_2)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI52_2)
-; CHECK-NEXT:    vle8.v v27, (a1)
-; CHECK-NEXT:    lui a1, %hi(.LCPI52_3)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI52_3)
-; CHECK-NEXT:    vle8.v v28, (a1)
-; CHECK-NEXT:    vsub.vv v25, v25, v26
-; CHECK-NEXT:    vmulhu.vv v25, v25, v27
-; CHECK-NEXT:    vadd.vv v25, v25, v26
-; CHECK-NEXT:    vsrl.vv v25, v25, v28
-; CHECK-NEXT:    vse8.v v25, (a0)
-; CHECK-NEXT:    ret
+; RV32-LABEL: mulhu_v16i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV32-NEXT:    vle8.v v25, (a0)
+; RV32-NEXT:    addi a1, zero, 513
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 4
+; RV32-NEXT:    vmerge.vim v26, v26, 1, v0
+; RV32-NEXT:    lui a1, 1
+; RV32-NEXT:    addi a2, a1, 78
+; RV32-NEXT:    vsetivli a3, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a2
+; RV32-NEXT:    vsetivli a2, 16, e8,m1,ta,mu
+; RV32-NEXT:    vmerge.vim v26, v26, 3, v0
+; RV32-NEXT:    lui a2, 8
+; RV32-NEXT:    addi a2, a2, 304
+; RV32-NEXT:    vsetivli a3, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a2
+; RV32-NEXT:    vsetivli a2, 16, e8,m1,ta,mu
+; RV32-NEXT:    vmerge.vim v26, v26, 2, v0
+; RV32-NEXT:    lui a2, 3
+; RV32-NEXT:    addi a2, a2, -2044
+; RV32-NEXT:    vsetivli a3, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a2
+; RV32-NEXT:    vsetivli a2, 16, e8,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v27, 0
+; RV32-NEXT:    addi a2, zero, -128
+; RV32-NEXT:    vmerge.vxm v28, v27, a2, v0
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV32-NEXT:    lui a1, %hi(.LCPI52_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI52_0)
+; RV32-NEXT:    vle8.v v29, (a1)
+; RV32-NEXT:    vmerge.vim v27, v27, 1, v0
+; RV32-NEXT:    vsrl.vv v27, v25, v27
+; RV32-NEXT:    vmulhu.vv v27, v27, v29
+; RV32-NEXT:    vsub.vv v25, v25, v27
+; RV32-NEXT:    vmulhu.vv v25, v25, v28
+; RV32-NEXT:    vadd.vv v25, v25, v27
+; RV32-NEXT:    vsrl.vv v25, v25, v26
+; RV32-NEXT:    vse8.v v25, (a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mulhu_v16i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV64-NEXT:    vle8.v v25, (a0)
+; RV64-NEXT:    addi a1, zero, 513
+; RV64-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV64-NEXT:    vmv.v.i v26, 4
+; RV64-NEXT:    vmerge.vim v26, v26, 1, v0
+; RV64-NEXT:    lui a1, 1
+; RV64-NEXT:    addiw a2, a1, 78
+; RV64-NEXT:    vsetivli a3, 1, e16,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a2
+; RV64-NEXT:    vsetivli a2, 16, e8,m1,ta,mu
+; RV64-NEXT:    vmerge.vim v26, v26, 3, v0
+; RV64-NEXT:    lui a2, 8
+; RV64-NEXT:    addiw a2, a2, 304
+; RV64-NEXT:    vsetivli a3, 1, e16,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a2
+; RV64-NEXT:    vsetivli a2, 16, e8,m1,ta,mu
+; RV64-NEXT:    vmerge.vim v26, v26, 2, v0
+; RV64-NEXT:    lui a2, 3
+; RV64-NEXT:    addiw a2, a2, -2044
+; RV64-NEXT:    vsetivli a3, 1, e16,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a2
+; RV64-NEXT:    vsetivli a2, 16, e8,m1,ta,mu
+; RV64-NEXT:    vmv.v.i v27, 0
+; RV64-NEXT:    addi a2, zero, -128
+; RV64-NEXT:    vmerge.vxm v28, v27, a2, v0
+; RV64-NEXT:    addiw a1, a1, 32
+; RV64-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV64-NEXT:    lui a1, %hi(.LCPI52_0)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI52_0)
+; RV64-NEXT:    vle8.v v29, (a1)
+; RV64-NEXT:    vmerge.vim v27, v27, 1, v0
+; RV64-NEXT:    vsrl.vv v27, v25, v27
+; RV64-NEXT:    vmulhu.vv v27, v27, v29
+; RV64-NEXT:    vsub.vv v25, v25, v27
+; RV64-NEXT:    vmulhu.vv v25, v25, v28
+; RV64-NEXT:    vadd.vv v25, v25, v27
+; RV64-NEXT:    vsrl.vv v25, v25, v26
+; RV64-NEXT:    vse8.v v25, (a0)
+; RV64-NEXT:    ret
   %a = load <16 x i8>, <16 x i8>* %x
   %b = udiv <16 x i8> %a, <i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25>
   store <16 x i8> %b, <16 x i8>* %x
@@ -872,27 +941,33 @@ define void @mulhu_v8i16(<8 x i16>* %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
 ; CHECK-NEXT:    vle16.v v25, (a0)
-; CHECK-NEXT:    vmv.v.i v26, 0
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    vmv1r.v v27, v26
-; CHECK-NEXT:    vmv.s.x v27, a1
 ; CHECK-NEXT:    addi a1, zero, 1
-; CHECK-NEXT:    vmv.s.x v28, a1
+; CHECK-NEXT:    vmv.s.x v26, a1
+; CHECK-NEXT:    addi a1, zero, 33
+; CHECK-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; CHECK-NEXT:    vmv.v.i v27, 3
+; CHECK-NEXT:    vmerge.vim v27, v27, 2, v0
+; CHECK-NEXT:    vsetivli a1, 7, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v27, v26, 6
+; CHECK-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; CHECK-NEXT:    vmv.v.i v28, 0
+; CHECK-NEXT:    lui a1, 1048568
+; CHECK-NEXT:    vmv1r.v v29, v28
+; CHECK-NEXT:    vmv.s.x v29, a1
 ; CHECK-NEXT:    vsetivli a1, 7, e16,m1,tu,mu
-; CHECK-NEXT:    vslideup.vi v26, v28, 6
+; CHECK-NEXT:    vslideup.vi v28, v26, 6
 ; CHECK-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
 ; CHECK-NEXT:    lui a1, %hi(.LCPI53_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI53_0)
-; CHECK-NEXT:    vle16.v v28, (a1)
-; CHECK-NEXT:    vsrl.vv v26, v25, v26
-; CHECK-NEXT:    vmulhu.vv v26, v26, v28
-; CHECK-NEXT:    lui a1, %hi(.LCPI53_1)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI53_1)
-; CHECK-NEXT:    vle16.v v28, (a1)
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vsrl.vv v28, v25, v28
+; CHECK-NEXT:    vmulhu.vv v26, v28, v26
 ; CHECK-NEXT:    vsub.vv v25, v25, v26
-; CHECK-NEXT:    vmulhu.vv v25, v25, v27
+; CHECK-NEXT:    vmulhu.vv v25, v25, v29
 ; CHECK-NEXT:    vadd.vv v25, v25, v26
-; CHECK-NEXT:    vsrl.vv v25, v25, v28
+; CHECK-NEXT:    vsrl.vv v25, v25, v27
 ; CHECK-NEXT:    vse16.v v25, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x i16>, <8 x i16>* %x
@@ -990,20 +1065,45 @@ define void @mulhu_v2i64(<2 x i64>* %x) {
 }
 
 define void @mulhs_v16i8(<16 x i8>* %x) {
-; CHECK-LABEL: mulhs_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
-; CHECK-NEXT:    vle8.v v25, (a0)
-; CHECK-NEXT:    lui a1, %hi(.LCPI56_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI56_0)
-; CHECK-NEXT:    vle8.v v26, (a1)
-; CHECK-NEXT:    lui a1, %hi(.LCPI56_1)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI56_1)
-; CHECK-NEXT:    vle8.v v27, (a1)
-; CHECK-NEXT:    vmulhu.vv v25, v25, v26
-; CHECK-NEXT:    vsrl.vv v25, v25, v27
-; CHECK-NEXT:    vse8.v v25, (a0)
-; CHECK-NEXT:    ret
+; RV32-LABEL: mulhs_v16i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV32-NEXT:    vle8.v v25, (a0)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, -1452
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 7
+; RV32-NEXT:    vmerge.vim v26, v26, 1, v0
+; RV32-NEXT:    addi a1, zero, -123
+; RV32-NEXT:    vmv.v.x v27, a1
+; RV32-NEXT:    addi a1, zero, 57
+; RV32-NEXT:    vmerge.vxm v27, v27, a1, v0
+; RV32-NEXT:    vmulhu.vv v25, v25, v27
+; RV32-NEXT:    vsrl.vv v25, v25, v26
+; RV32-NEXT:    vse8.v v25, (a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mulhs_v16i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV64-NEXT:    vle8.v v25, (a0)
+; RV64-NEXT:    lui a1, 5
+; RV64-NEXT:    addiw a1, a1, -1452
+; RV64-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; RV64-NEXT:    vmv.v.i v26, 7
+; RV64-NEXT:    vmerge.vim v26, v26, 1, v0
+; RV64-NEXT:    addi a1, zero, -123
+; RV64-NEXT:    vmv.v.x v27, a1
+; RV64-NEXT:    addi a1, zero, 57
+; RV64-NEXT:    vmerge.vxm v27, v27, a1, v0
+; RV64-NEXT:    vmulhu.vv v25, v25, v27
+; RV64-NEXT:    vsrl.vv v25, v25, v26
+; RV64-NEXT:    vse8.v v25, (a0)
+; RV64-NEXT:    ret
   %a = load <16 x i8>, <16 x i8>* %x
   %b = udiv <16 x i8> %a, <i8 -9, i8 -9, i8 9, i8 -9, i8 9, i8 -9, i8 9, i8 -9, i8 -9, i8 9, i8 -9, i8 9, i8 -9, i8 -9, i8 9, i8 -9>
   store <16 x i8> %b, <16 x i8>* %x
@@ -1011,19 +1111,47 @@ define void @mulhs_v16i8(<16 x i8>* %x) {
 }
 
 define void @mulhs_v8i16(<8 x i16>* %x) {
-; CHECK-LABEL: mulhs_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
-; CHECK-NEXT:    vle16.v v25, (a0)
-; CHECK-NEXT:    lui a1, %hi(.LCPI57_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI57_0)
-; CHECK-NEXT:    vle16.v v26, (a1)
-; CHECK-NEXT:    vmulh.vv v25, v25, v26
-; CHECK-NEXT:    vsra.vi v25, v25, 1
-; CHECK-NEXT:    vsrl.vi v26, v25, 15
-; CHECK-NEXT:    vadd.vv v25, v25, v26
-; CHECK-NEXT:    vse16.v v25, (a0)
-; CHECK-NEXT:    ret
+; RV32-LABEL: mulhs_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
+; RV32-NEXT:    addi a1, zero, 105
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, -1755
+; RV32-NEXT:    vsetivli a2, 8, e16,m1,ta,mu
+; RV32-NEXT:    vmv.v.x v26, a1
+; RV32-NEXT:    lui a1, 1048571
+; RV32-NEXT:    addi a1, a1, 1755
+; RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
+; RV32-NEXT:    vmulh.vv v25, v25, v26
+; RV32-NEXT:    vsra.vi v25, v25, 1
+; RV32-NEXT:    vsrl.vi v26, v25, 15
+; RV32-NEXT:    vadd.vv v25, v25, v26
+; RV32-NEXT:    vse16.v v25, (a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mulhs_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV64-NEXT:    vle16.v v25, (a0)
+; RV64-NEXT:    addi a1, zero, 105
+; RV64-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    lui a1, 5
+; RV64-NEXT:    addiw a1, a1, -1755
+; RV64-NEXT:    vsetivli a2, 8, e16,m1,ta,mu
+; RV64-NEXT:    vmv.v.x v26, a1
+; RV64-NEXT:    lui a1, 1048571
+; RV64-NEXT:    addiw a1, a1, 1755
+; RV64-NEXT:    vmerge.vxm v26, v26, a1, v0
+; RV64-NEXT:    vmulh.vv v25, v25, v26
+; RV64-NEXT:    vsra.vi v25, v25, 1
+; RV64-NEXT:    vsrl.vi v26, v25, 15
+; RV64-NEXT:    vadd.vv v25, v25, v26
+; RV64-NEXT:    vse16.v v25, (a0)
+; RV64-NEXT:    ret
   %a = load <8 x i16>, <8 x i16>* %x
   %b = sdiv <8 x i16> %a, <i16 -7, i16 7, i16 7, i16 -7, i16 7, i16 -7, i16 -7, i16 7>
   store <8 x i16> %b, <8 x i16>* %x
@@ -1035,9 +1163,16 @@ define void @mulhs_v4i32(<4 x i32>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
 ; RV32-NEXT:    vle32.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI58_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI58_0)
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    lui a1, 419430
+; RV32-NEXT:    addi a1, a1, 1639
+; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.x v26, a1
+; RV32-NEXT:    lui a1, 629146
+; RV32-NEXT:    addi a1, a1, -1639
+; RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; RV32-NEXT:    vmulh.vv v25, v25, v26
 ; RV32-NEXT:    vsrl.vi v26, v25, 31
 ; RV32-NEXT:    vsra.vi v25, v25, 1
@@ -1049,9 +1184,16 @@ define void @mulhs_v4i32(<4 x i32>* %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
 ; RV64-NEXT:    vle32.v v25, (a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI58_0)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI58_0)
-; RV64-NEXT:    vle32.v v26, (a1)
+; RV64-NEXT:    addi a1, zero, 5
+; RV64-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    lui a1, 419430
+; RV64-NEXT:    addiw a1, a1, 1639
+; RV64-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; RV64-NEXT:    vmv.v.x v26, a1
+; RV64-NEXT:    lui a1, 629146
+; RV64-NEXT:    addiw a1, a1, -1639
+; RV64-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; RV64-NEXT:    vmulh.vv v25, v25, v26
 ; RV64-NEXT:    vsra.vi v25, v25, 1
 ; RV64-NEXT:    vsrl.vi v26, v25, 31
@@ -1069,36 +1211,40 @@ define void @mulhs_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI59_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI59_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
-; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; RV32-NEXT:    vmul.vv v26, v25, v26
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a2, a1, 1365
 ; RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
-; RV32-NEXT:    vmv.v.x v27, a2
+; RV32-NEXT:    vmv.v.x v26, a2
 ; RV32-NEXT:    addi a1, a1, 1366
-; RV32-NEXT:    vmv.s.x v27, a1
+; RV32-NEXT:    vmv.s.x v26, a1
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; RV32-NEXT:    vmulh.vv v25, v25, v27
-; RV32-NEXT:    vadd.vv v25, v25, v26
-; RV32-NEXT:    lui a1, %hi(.LCPI59_1)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI59_1)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
-; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; RV32-NEXT:    vsrl.vv v26, v25, v26
+; RV32-NEXT:    vmulh.vv v26, v25, v26
 ; RV32-NEXT:    addi a1, zero, 1
+; RV32-NEXT:    addi a2, zero, 3
+; RV32-NEXT:    vsetivli a3, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a2
+; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v27, -1
+; RV32-NEXT:    vmerge.vim v27, v27, 0, v0
+; RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; RV32-NEXT:    vmul.vv v25, v25, v27
+; RV32-NEXT:    vadd.vv v25, v26, v25
+; RV32-NEXT:    addi a2, zero, 5
+; RV32-NEXT:    vsetivli a3, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a2
 ; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vmv.s.x v27, a1
-; RV32-NEXT:    vmv.v.i v28, 0
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    addi a2, zero, 63
+; RV32-NEXT:    vmerge.vxm v27, v26, a2, v0
+; RV32-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; RV32-NEXT:    vsrl.vv v27, v25, v27
+; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v28, a1
 ; RV32-NEXT:    vsetivli a1, 3, e32,m1,tu,mu
-; RV32-NEXT:    vslideup.vi v28, v27, 2
+; RV32-NEXT:    vslideup.vi v26, v28, 2
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
-; RV32-NEXT:    vsra.vv v25, v25, v28
-; RV32-NEXT:    vadd.vv v25, v25, v26
+; RV32-NEXT:    vsra.vv v25, v25, v26
+; RV32-NEXT:    vadd.vv v25, v25, v27
 ; RV32-NEXT:    vse64.v v25, (a0)
 ; RV32-NEXT:    ret
 ;
@@ -3848,31 +3994,105 @@ define void @extract_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 }
 
 define void @mulhu_v32i8(<32 x i8>* %x) {
-; LMULMAX2-LABEL: mulhu_v32i8:
-; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    addi a1, zero, 32
-; LMULMAX2-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
-; LMULMAX2-NEXT:    vle8.v v26, (a0)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI129_0)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI129_0)
-; LMULMAX2-NEXT:    vle8.v v28, (a1)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI129_1)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI129_1)
-; LMULMAX2-NEXT:    vle8.v v30, (a1)
-; LMULMAX2-NEXT:    vsrl.vv v28, v26, v28
-; LMULMAX2-NEXT:    vmulhu.vv v28, v28, v30
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI129_2)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI129_2)
-; LMULMAX2-NEXT:    vle8.v v30, (a1)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI129_3)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI129_3)
-; LMULMAX2-NEXT:    vle8.v v8, (a1)
-; LMULMAX2-NEXT:    vsub.vv v26, v26, v28
-; LMULMAX2-NEXT:    vmulhu.vv v26, v26, v30
-; LMULMAX2-NEXT:    vadd.vv v26, v26, v28
-; LMULMAX2-NEXT:    vsrl.vv v26, v26, v8
-; LMULMAX2-NEXT:    vse8.v v26, (a0)
-; LMULMAX2-NEXT:    ret
+; LMULMAX2-RV32-LABEL: mulhu_v32i8:
+; LMULMAX2-RV32:       # %bb.0:
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 32
+; LMULMAX2-RV32-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vle8.v v26, (a0)
+; LMULMAX2-RV32-NEXT:    lui a2, 8208
+; LMULMAX2-RV32-NEXT:    addi a2, a2, 513
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v28, 4
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v28, 1, v0
+; LMULMAX2-RV32-NEXT:    lui a2, 66785
+; LMULMAX2-RV32-NEXT:    addi a2, a2, 78
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v28, 3, v0
+; LMULMAX2-RV32-NEXT:    lui a2, 529160
+; LMULMAX2-RV32-NEXT:    addi a2, a2, 304
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v28, 2, v0
+; LMULMAX2-RV32-NEXT:    lui a2, 163907
+; LMULMAX2-RV32-NEXT:    addi a2, a2, -2044
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v30, 0
+; LMULMAX2-RV32-NEXT:    addi a2, zero, -128
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v8, v30, a2, v0
+; LMULMAX2-RV32-NEXT:    lui a2, 66049
+; LMULMAX2-RV32-NEXT:    addi a2, a2, 32
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI129_0)
+; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI129_0)
+; LMULMAX2-RV32-NEXT:    vle8.v v10, (a1)
+; LMULMAX2-RV32-NEXT:    vmerge.vim v30, v30, 1, v0
+; LMULMAX2-RV32-NEXT:    vsrl.vv v30, v26, v30
+; LMULMAX2-RV32-NEXT:    vmulhu.vv v30, v30, v10
+; LMULMAX2-RV32-NEXT:    vsub.vv v26, v26, v30
+; LMULMAX2-RV32-NEXT:    vmulhu.vv v26, v26, v8
+; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v30
+; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vse8.v v26, (a0)
+; LMULMAX2-RV32-NEXT:    ret
+;
+; LMULMAX2-RV64-LABEL: mulhu_v32i8:
+; LMULMAX2-RV64:       # %bb.0:
+; LMULMAX2-RV64-NEXT:    addi a1, zero, 32
+; LMULMAX2-RV64-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vle8.v v26, (a0)
+; LMULMAX2-RV64-NEXT:    lui a2, 8208
+; LMULMAX2-RV64-NEXT:    addiw a2, a2, 513
+; LMULMAX2-RV64-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.v.i v28, 4
+; LMULMAX2-RV64-NEXT:    vmerge.vim v28, v28, 1, v0
+; LMULMAX2-RV64-NEXT:    lui a2, 66785
+; LMULMAX2-RV64-NEXT:    addiw a2, a2, 78
+; LMULMAX2-RV64-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmerge.vim v28, v28, 3, v0
+; LMULMAX2-RV64-NEXT:    lui a2, 529160
+; LMULMAX2-RV64-NEXT:    addiw a2, a2, 304
+; LMULMAX2-RV64-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmerge.vim v28, v28, 2, v0
+; LMULMAX2-RV64-NEXT:    lui a2, 163907
+; LMULMAX2-RV64-NEXT:    addiw a2, a2, -2044
+; LMULMAX2-RV64-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.v.i v30, 0
+; LMULMAX2-RV64-NEXT:    addi a2, zero, -128
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v8, v30, a2, v0
+; LMULMAX2-RV64-NEXT:    lui a2, 66049
+; LMULMAX2-RV64-NEXT:    addiw a2, a2, 32
+; LMULMAX2-RV64-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI129_0)
+; LMULMAX2-RV64-NEXT:    addi a1, a1, %lo(.LCPI129_0)
+; LMULMAX2-RV64-NEXT:    vle8.v v10, (a1)
+; LMULMAX2-RV64-NEXT:    vmerge.vim v30, v30, 1, v0
+; LMULMAX2-RV64-NEXT:    vsrl.vv v30, v26, v30
+; LMULMAX2-RV64-NEXT:    vmulhu.vv v30, v30, v10
+; LMULMAX2-RV64-NEXT:    vsub.vv v26, v26, v30
+; LMULMAX2-RV64-NEXT:    vmulhu.vv v26, v26, v8
+; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v30
+; LMULMAX2-RV64-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vse8.v v26, (a0)
+; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: mulhu_v32i8:
 ; LMULMAX1:       # %bb.0:
@@ -3895,30 +4115,83 @@ define void @mulhu_v32i8(<32 x i8>* %x) {
 }
 
 define void @mulhu_v16i16(<16 x i16>* %x) {
-; LMULMAX2-LABEL: mulhu_v16i16:
-; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
-; LMULMAX2-NEXT:    vle16.v v26, (a0)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI130_0)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI130_0)
-; LMULMAX2-NEXT:    vle16.v v28, (a1)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI130_1)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI130_1)
-; LMULMAX2-NEXT:    vle16.v v30, (a1)
-; LMULMAX2-NEXT:    vsrl.vv v28, v26, v28
-; LMULMAX2-NEXT:    vmulhu.vv v28, v28, v30
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI130_2)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI130_2)
-; LMULMAX2-NEXT:    vle16.v v30, (a1)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI130_3)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI130_3)
-; LMULMAX2-NEXT:    vle16.v v8, (a1)
-; LMULMAX2-NEXT:    vsub.vv v26, v26, v28
-; LMULMAX2-NEXT:    vmulhu.vv v26, v26, v30
-; LMULMAX2-NEXT:    vadd.vv v26, v26, v28
-; LMULMAX2-NEXT:    vsrl.vv v26, v26, v8
-; LMULMAX2-NEXT:    vse16.v v26, (a0)
-; LMULMAX2-NEXT:    ret
+; LMULMAX2-RV32-LABEL: mulhu_v16i16:
+; LMULMAX2-RV32:       # %bb.0:
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vle16.v v26, (a0)
+; LMULMAX2-RV32-NEXT:    lui a1, 2
+; LMULMAX2-RV32-NEXT:    addi a1, a1, 289
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v28, 3
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v28, 2, v0
+; LMULMAX2-RV32-NEXT:    lui a1, 4
+; LMULMAX2-RV32-NEXT:    addi a1, a1, 64
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v28, 1, v0
+; LMULMAX2-RV32-NEXT:    vmv1r.v v12, v0
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 257
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v30, 0
+; LMULMAX2-RV32-NEXT:    lui a1, 1048568
+; LMULMAX2-RV32-NEXT:    lui a2, %hi(.LCPI130_0)
+; LMULMAX2-RV32-NEXT:    addi a2, a2, %lo(.LCPI130_0)
+; LMULMAX2-RV32-NEXT:    vle16.v v8, (a2)
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v10, v30, a1, v0
+; LMULMAX2-RV32-NEXT:    vmv1r.v v0, v12
+; LMULMAX2-RV32-NEXT:    vmerge.vim v30, v30, 1, v0
+; LMULMAX2-RV32-NEXT:    vsrl.vv v30, v26, v30
+; LMULMAX2-RV32-NEXT:    vmulhu.vv v30, v30, v8
+; LMULMAX2-RV32-NEXT:    vsub.vv v26, v26, v30
+; LMULMAX2-RV32-NEXT:    vmulhu.vv v26, v26, v10
+; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v30
+; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vse16.v v26, (a0)
+; LMULMAX2-RV32-NEXT:    ret
+;
+; LMULMAX2-RV64-LABEL: mulhu_v16i16:
+; LMULMAX2-RV64:       # %bb.0:
+; LMULMAX2-RV64-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vle16.v v26, (a0)
+; LMULMAX2-RV64-NEXT:    lui a1, 2
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, 289
+; LMULMAX2-RV64-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV64-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.v.i v28, 3
+; LMULMAX2-RV64-NEXT:    vmerge.vim v28, v28, 2, v0
+; LMULMAX2-RV64-NEXT:    lui a1, 4
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, 64
+; LMULMAX2-RV64-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV64-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmerge.vim v28, v28, 1, v0
+; LMULMAX2-RV64-NEXT:    vmv1r.v v12, v0
+; LMULMAX2-RV64-NEXT:    addi a1, zero, 257
+; LMULMAX2-RV64-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV64-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.v.i v30, 0
+; LMULMAX2-RV64-NEXT:    lui a1, 1048568
+; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI130_0)
+; LMULMAX2-RV64-NEXT:    addi a2, a2, %lo(.LCPI130_0)
+; LMULMAX2-RV64-NEXT:    vle16.v v8, (a2)
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v10, v30, a1, v0
+; LMULMAX2-RV64-NEXT:    vmv1r.v v0, v12
+; LMULMAX2-RV64-NEXT:    vmerge.vim v30, v30, 1, v0
+; LMULMAX2-RV64-NEXT:    vsrl.vv v30, v26, v30
+; LMULMAX2-RV64-NEXT:    vmulhu.vv v30, v30, v8
+; LMULMAX2-RV64-NEXT:    vsub.vv v26, v26, v30
+; LMULMAX2-RV64-NEXT:    vmulhu.vv v26, v26, v10
+; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v30
+; LMULMAX2-RV64-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vse16.v v26, (a0)
+; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: mulhu_v16i16:
 ; LMULMAX1:       # %bb.0:
@@ -3945,20 +4218,27 @@ define void @mulhu_v8i32(<8 x i32>* %x) {
 ; LMULMAX2:       # %bb.0:
 ; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
 ; LMULMAX2-NEXT:    vle32.v v26, (a0)
+; LMULMAX2-NEXT:    addi a1, zero, 68
+; LMULMAX2-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
 ; LMULMAX2-NEXT:    lui a1, %hi(.LCPI131_0)
 ; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI131_0)
 ; LMULMAX2-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-NEXT:    vmv.v.i v30, 0
+; LMULMAX2-NEXT:    lui a1, 524288
+; LMULMAX2-NEXT:    vmerge.vxm v30, v30, a1, v0
 ; LMULMAX2-NEXT:    vmulhu.vv v28, v26, v28
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI131_1)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI131_1)
-; LMULMAX2-NEXT:    vle32.v v30, (a1)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI131_2)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI131_2)
-; LMULMAX2-NEXT:    vle32.v v8, (a1)
 ; LMULMAX2-NEXT:    vsub.vv v26, v26, v28
 ; LMULMAX2-NEXT:    vmulhu.vv v26, v26, v30
 ; LMULMAX2-NEXT:    vadd.vv v26, v26, v28
-; LMULMAX2-NEXT:    vsrl.vv v26, v26, v8
+; LMULMAX2-NEXT:    addi a1, zero, 136
+; LMULMAX2-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vmv.v.i v28, 2
+; LMULMAX2-NEXT:    vmerge.vim v28, v28, 1, v0
+; LMULMAX2-NEXT:    vsrl.vv v26, v26, v28
 ; LMULMAX2-NEXT:    vse32.v v26, (a0)
 ; LMULMAX2-NEXT:    ret
 ;
@@ -4163,36 +4443,85 @@ define void @mulhu_v4i64(<4 x i64>* %x) {
 }
 
 define void @mulhs_v32i8(<32 x i8>* %x) {
-; LMULMAX2-LABEL: mulhs_v32i8:
-; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    addi a1, zero, 32
-; LMULMAX2-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
-; LMULMAX2-NEXT:    vle8.v v26, (a0)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI133_0)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI133_0)
-; LMULMAX2-NEXT:    vle8.v v28, (a1)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI133_1)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI133_1)
-; LMULMAX2-NEXT:    vle8.v v30, (a1)
-; LMULMAX2-NEXT:    vmulhu.vv v26, v26, v28
-; LMULMAX2-NEXT:    vsrl.vv v26, v26, v30
-; LMULMAX2-NEXT:    vse8.v v26, (a0)
-; LMULMAX2-NEXT:    ret
+; LMULMAX2-RV32-LABEL: mulhs_v32i8:
+; LMULMAX2-RV32:       # %bb.0:
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 32
+; LMULMAX2-RV32-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vle8.v v26, (a0)
+; LMULMAX2-RV32-NEXT:    lui a2, 304453
+; LMULMAX2-RV32-NEXT:    addi a2, a2, -1452
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV32-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v28, 7
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v28, 1, v0
+; LMULMAX2-RV32-NEXT:    addi a1, zero, -123
+; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a1
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 57
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v30, v30, a1, v0
+; LMULMAX2-RV32-NEXT:    vmulhu.vv v26, v26, v30
+; LMULMAX2-RV32-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vse8.v v26, (a0)
+; LMULMAX2-RV32-NEXT:    ret
 ;
-; LMULMAX1-LABEL: mulhs_v32i8:
-; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
-; LMULMAX1-NEXT:    addi a1, a0, 16
-; LMULMAX1-NEXT:    vle8.v v25, (a1)
-; LMULMAX1-NEXT:    lui a2, %hi(.LCPI133_0)
-; LMULMAX1-NEXT:    addi a2, a2, %lo(.LCPI133_0)
-; LMULMAX1-NEXT:    vle8.v v26, (a2)
-; LMULMAX1-NEXT:    vle8.v v27, (a0)
-; LMULMAX1-NEXT:    vdivu.vv v25, v25, v26
-; LMULMAX1-NEXT:    vdivu.vv v26, v27, v26
-; LMULMAX1-NEXT:    vse8.v v26, (a0)
-; LMULMAX1-NEXT:    vse8.v v25, (a1)
-; LMULMAX1-NEXT:    ret
+; LMULMAX2-RV64-LABEL: mulhs_v32i8:
+; LMULMAX2-RV64:       # %bb.0:
+; LMULMAX2-RV64-NEXT:    addi a1, zero, 32
+; LMULMAX2-RV64-NEXT:    vsetvli a2, a1, e8,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vle8.v v26, (a0)
+; LMULMAX2-RV64-NEXT:    lui a2, 304453
+; LMULMAX2-RV64-NEXT:    addiw a2, a2, -1452
+; LMULMAX2-RV64-NEXT:    vsetivli a3, 1, e32,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX2-RV64-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.v.i v28, 7
+; LMULMAX2-RV64-NEXT:    vmerge.vim v28, v28, 1, v0
+; LMULMAX2-RV64-NEXT:    addi a1, zero, -123
+; LMULMAX2-RV64-NEXT:    vmv.v.x v30, a1
+; LMULMAX2-RV64-NEXT:    addi a1, zero, 57
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v30, v30, a1, v0
+; LMULMAX2-RV64-NEXT:    vmulhu.vv v26, v26, v30
+; LMULMAX2-RV64-NEXT:    vsrl.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vse8.v v26, (a0)
+; LMULMAX2-RV64-NEXT:    ret
+;
+; LMULMAX1-RV32-LABEL: mulhs_v32i8:
+; LMULMAX1-RV32:       # %bb.0:
+; LMULMAX1-RV32-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vle8.v v25, (a0)
+; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
+; LMULMAX1-RV32-NEXT:    vle8.v v26, (a1)
+; LMULMAX1-RV32-NEXT:    lui a2, 5
+; LMULMAX1-RV32-NEXT:    addi a2, a2, -1452
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 1, e16,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-RV32-NEXT:    vsetivli a2, 16, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.i v27, -9
+; LMULMAX1-RV32-NEXT:    vmerge.vim v27, v27, 9, v0
+; LMULMAX1-RV32-NEXT:    vdivu.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vdivu.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vse8.v v25, (a0)
+; LMULMAX1-RV32-NEXT:    vse8.v v26, (a1)
+; LMULMAX1-RV32-NEXT:    ret
+;
+; LMULMAX1-RV64-LABEL: mulhs_v32i8:
+; LMULMAX1-RV64:       # %bb.0:
+; LMULMAX1-RV64-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; LMULMAX1-RV64-NEXT:    vle8.v v25, (a0)
+; LMULMAX1-RV64-NEXT:    addi a1, a0, 16
+; LMULMAX1-RV64-NEXT:    vle8.v v26, (a1)
+; LMULMAX1-RV64-NEXT:    lui a2, 5
+; LMULMAX1-RV64-NEXT:    addiw a2, a2, -1452
+; LMULMAX1-RV64-NEXT:    vsetivli a3, 1, e16,m1,ta,mu
+; LMULMAX1-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-RV64-NEXT:    vsetivli a2, 16, e8,m1,ta,mu
+; LMULMAX1-RV64-NEXT:    vmv.v.i v27, -9
+; LMULMAX1-RV64-NEXT:    vmerge.vim v27, v27, 9, v0
+; LMULMAX1-RV64-NEXT:    vdivu.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT:    vdivu.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT:    vse8.v v25, (a0)
+; LMULMAX1-RV64-NEXT:    vse8.v v26, (a1)
+; LMULMAX1-RV64-NEXT:    ret
   %a = load <32 x i8>, <32 x i8>* %x
   %b = udiv <32 x i8> %a, <i8 -9, i8 -9, i8 9, i8 -9, i8 9, i8 -9, i8 9, i8 -9, i8 -9, i8 9, i8 -9, i8 9, i8 -9, i8 -9, i8 9, i8 -9, i8 -9, i8 -9, i8 9, i8 -9, i8 9, i8 -9, i8 9, i8 -9, i8 -9, i8 9, i8 -9, i8 9, i8 -9, i8 -9, i8 9, i8 -9>
   store <32 x i8> %b, <32 x i8>* %x
@@ -4200,33 +4529,66 @@ define void @mulhs_v32i8(<32 x i8>* %x) {
 }
 
 define void @mulhs_v16i16(<16 x i16>* %x) {
-; LMULMAX2-LABEL: mulhs_v16i16:
-; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
-; LMULMAX2-NEXT:    vle16.v v26, (a0)
-; LMULMAX2-NEXT:    lui a1, %hi(.LCPI134_0)
-; LMULMAX2-NEXT:    addi a1, a1, %lo(.LCPI134_0)
-; LMULMAX2-NEXT:    vle16.v v28, (a1)
-; LMULMAX2-NEXT:    vmulh.vv v26, v26, v28
-; LMULMAX2-NEXT:    vsra.vi v26, v26, 1
-; LMULMAX2-NEXT:    vsrl.vi v28, v26, 15
-; LMULMAX2-NEXT:    vadd.vv v26, v26, v28
-; LMULMAX2-NEXT:    vse16.v v26, (a0)
-; LMULMAX2-NEXT:    ret
+; LMULMAX2-RV32-LABEL: mulhs_v16i16:
+; LMULMAX2-RV32:       # %bb.0:
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vle16.v v26, (a0)
+; LMULMAX2-RV32-NEXT:    lui a1, 7
+; LMULMAX2-RV32-NEXT:    addi a1, a1, -1687
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    lui a1, 5
+; LMULMAX2-RV32-NEXT:    addi a1, a1, -1755
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 16, e16,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV32-NEXT:    lui a1, 1048571
+; LMULMAX2-RV32-NEXT:    addi a1, a1, 1755
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v28, v28, a1, v0
+; LMULMAX2-RV32-NEXT:    vmulh.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsra.vi v26, v26, 1
+; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 15
+; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vse16.v v26, (a0)
+; LMULMAX2-RV32-NEXT:    ret
+;
+; LMULMAX2-RV64-LABEL: mulhs_v16i16:
+; LMULMAX2-RV64:       # %bb.0:
+; LMULMAX2-RV64-NEXT:    vsetivli a1, 16, e16,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vle16.v v26, (a0)
+; LMULMAX2-RV64-NEXT:    lui a1, 7
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1687
+; LMULMAX2-RV64-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV64-NEXT:    lui a1, 5
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1755
+; LMULMAX2-RV64-NEXT:    vsetivli a2, 16, e16,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV64-NEXT:    lui a1, 1048571
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1755
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v28, v28, a1, v0
+; LMULMAX2-RV64-NEXT:    vmulh.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vsra.vi v26, v26, 1
+; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 15
+; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v28
+; LMULMAX2-RV64-NEXT:    vse16.v v26, (a0)
+; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: mulhs_v16i16:
 ; LMULMAX1:       # %bb.0:
 ; LMULMAX1-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-NEXT:    addi a1, a0, 16
-; LMULMAX1-NEXT:    vle16.v v25, (a1)
-; LMULMAX1-NEXT:    lui a2, %hi(.LCPI134_0)
-; LMULMAX1-NEXT:    addi a2, a2, %lo(.LCPI134_0)
-; LMULMAX1-NEXT:    vle16.v v26, (a2)
-; LMULMAX1-NEXT:    vle16.v v27, (a0)
-; LMULMAX1-NEXT:    vdiv.vv v25, v25, v26
-; LMULMAX1-NEXT:    vdiv.vv v26, v27, v26
-; LMULMAX1-NEXT:    vse16.v v26, (a0)
-; LMULMAX1-NEXT:    vse16.v v25, (a1)
+; LMULMAX1-NEXT:    vle16.v v26, (a1)
+; LMULMAX1-NEXT:    addi a2, zero, 105
+; LMULMAX1-NEXT:    vsetivli a3, 1, e8,m1,ta,mu
+; LMULMAX1-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-NEXT:    vsetivli a2, 8, e16,m1,ta,mu
+; LMULMAX1-NEXT:    vmv.v.i v27, 7
+; LMULMAX1-NEXT:    vmerge.vim v27, v27, -7, v0
+; LMULMAX1-NEXT:    vdiv.vv v26, v26, v27
+; LMULMAX1-NEXT:    vdiv.vv v25, v25, v27
+; LMULMAX1-NEXT:    vse16.v v25, (a0)
+; LMULMAX1-NEXT:    vse16.v v26, (a1)
 ; LMULMAX1-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %x
   %b = sdiv <16 x i16> %a, <i16 -7, i16 7, i16 7, i16 -7, i16 7, i16 -7, i16 -7, i16 7, i16 -7, i16 7, i16 7, i16 -7, i16 7, i16 -7, i16 -7, i16 7>
@@ -4239,9 +4601,16 @@ define void @mulhs_v8i32(<8 x i32>* %x) {
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle32.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI135_0)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI135_0)
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 85
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    lui a1, 419430
+; LMULMAX2-RV32-NEXT:    addi a1, a1, 1639
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV32-NEXT:    lui a1, 629146
+; LMULMAX2-RV32-NEXT:    addi a1, a1, -1639
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v28, v28, a1, v0
 ; LMULMAX2-RV32-NEXT:    vmulh.vv v26, v26, v28
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v28, v26, 31
 ; LMULMAX2-RV32-NEXT:    vsra.vi v26, v26, 1
@@ -4253,9 +4622,16 @@ define void @mulhs_v8i32(<8 x i32>* %x) {
 ; LMULMAX2-RV64:       # %bb.0:
 ; LMULMAX2-RV64-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle32.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI135_0)
-; LMULMAX2-RV64-NEXT:    addi a1, a1, %lo(.LCPI135_0)
-; LMULMAX2-RV64-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-RV64-NEXT:    addi a1, zero, 85
+; LMULMAX2-RV64-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV64-NEXT:    lui a1, 419430
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1639
+; LMULMAX2-RV64-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.v.x v28, a1
+; LMULMAX2-RV64-NEXT:    lui a1, 629146
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, -1639
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v28, v28, a1, v0
 ; LMULMAX2-RV64-NEXT:    vmulh.vv v26, v26, v28
 ; LMULMAX2-RV64-NEXT:    vsra.vi v26, v26, 1
 ; LMULMAX2-RV64-NEXT:    vsrl.vi v28, v26, 31
@@ -4266,37 +4642,47 @@ define void @mulhs_v8i32(<8 x i32>* %x) {
 ; LMULMAX1-RV32-LABEL: mulhs_v8i32:
 ; LMULMAX1-RV32:       # %bb.0:
 ; LMULMAX1-RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
-; LMULMAX1-RV32-NEXT:    lui a2, %hi(.LCPI135_0)
-; LMULMAX1-RV32-NEXT:    addi a2, a2, %lo(.LCPI135_0)
-; LMULMAX1-RV32-NEXT:    vle32.v v26, (a2)
-; LMULMAX1-RV32-NEXT:    vle32.v v27, (a0)
-; LMULMAX1-RV32-NEXT:    vmulh.vv v25, v25, v26
-; LMULMAX1-RV32-NEXT:    vsrl.vi v28, v25, 31
-; LMULMAX1-RV32-NEXT:    vsra.vi v25, v25, 1
-; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v28
-; LMULMAX1-RV32-NEXT:    vmulh.vv v26, v27, v26
-; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v26, 31
+; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
+; LMULMAX1-RV32-NEXT:    addi a2, zero, 5
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 1, e8,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-RV32-NEXT:    lui a2, 419430
+; LMULMAX1-RV32-NEXT:    addi a2, a2, 1639
+; LMULMAX1-RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; LMULMAX1-RV32-NEXT:    vmv.v.x v27, a2
+; LMULMAX1-RV32-NEXT:    lui a2, 629146
+; LMULMAX1-RV32-NEXT:    addi a2, a2, -1639
+; LMULMAX1-RV32-NEXT:    vmerge.vxm v27, v27, a2, v0
+; LMULMAX1-RV32-NEXT:    vmulh.vv v26, v26, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vi v28, v26, 31
 ; LMULMAX1-RV32-NEXT:    vsra.vi v26, v26, 1
-; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v27
-; LMULMAX1-RV32-NEXT:    vse32.v v26, (a0)
-; LMULMAX1-RV32-NEXT:    vse32.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    vadd.vv v26, v26, v28
+; LMULMAX1-RV32-NEXT:    vmulh.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vsrl.vi v27, v25, 31
+; LMULMAX1-RV32-NEXT:    vsra.vi v25, v25, 1
+; LMULMAX1-RV32-NEXT:    vadd.vv v25, v25, v27
+; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
+; LMULMAX1-RV32-NEXT:    vse32.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: mulhs_v8i32:
 ; LMULMAX1-RV64:       # %bb.0:
 ; LMULMAX1-RV64-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-RV64-NEXT:    vle32.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV64-NEXT:    vle32.v v25, (a1)
-; LMULMAX1-RV64-NEXT:    lui a2, %hi(.LCPI135_0)
-; LMULMAX1-RV64-NEXT:    addi a2, a2, %lo(.LCPI135_0)
-; LMULMAX1-RV64-NEXT:    vle32.v v26, (a2)
-; LMULMAX1-RV64-NEXT:    vle32.v v27, (a0)
-; LMULMAX1-RV64-NEXT:    vdiv.vv v25, v25, v26
-; LMULMAX1-RV64-NEXT:    vdiv.vv v26, v27, v26
-; LMULMAX1-RV64-NEXT:    vse32.v v26, (a0)
-; LMULMAX1-RV64-NEXT:    vse32.v v25, (a1)
+; LMULMAX1-RV64-NEXT:    vle32.v v26, (a1)
+; LMULMAX1-RV64-NEXT:    addi a2, zero, 5
+; LMULMAX1-RV64-NEXT:    vsetivli a3, 1, e8,m1,ta,mu
+; LMULMAX1-RV64-NEXT:    vmv.s.x v0, a2
+; LMULMAX1-RV64-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-RV64-NEXT:    vmv.v.i v27, 5
+; LMULMAX1-RV64-NEXT:    vmerge.vim v27, v27, -5, v0
+; LMULMAX1-RV64-NEXT:    vdiv.vv v26, v26, v27
+; LMULMAX1-RV64-NEXT:    vdiv.vv v25, v25, v27
+; LMULMAX1-RV64-NEXT:    vse32.v v25, (a0)
+; LMULMAX1-RV64-NEXT:    vse32.v v26, (a1)
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <8 x i32>, <8 x i32>* %x
   %b = sdiv <8 x i32> %a, <i32 -5, i32 5, i32 -5, i32 5, i32 -5, i32 5, i32 -5, i32 5>
@@ -4309,32 +4695,43 @@ define void @mulhs_v4i64(<4 x i64>* %x) {
 ; LMULMAX2-RV32:       # %bb.0:
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle64.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI136_0)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI136_0)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 51
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v28, -1
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v28, 0, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vmul.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI136_1)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI136_1)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v30, (a1)
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 17
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    lui a1, 349525
+; LMULMAX2-RV32-NEXT:    addi a2, a1, 1365
+; LMULMAX2-RV32-NEXT:    vsetivli a3, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.x v30, a2
+; LMULMAX2-RV32-NEXT:    addi a1, a1, 1366
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v30, v30, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vmulh.vv v26, v26, v30
 ; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI136_2)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI136_2)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v28, (a1)
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 85
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.v.i v28, 0
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 63
+; LMULMAX2-RV32-NEXT:    vmerge.vxm v30, v28, a1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsrl.vv v28, v26, v28
-; LMULMAX2-RV32-NEXT:    lui a1, %hi(.LCPI136_3)
-; LMULMAX2-RV32-NEXT:    addi a1, a1, %lo(.LCPI136_3)
-; LMULMAX2-RV32-NEXT:    vsetivli a2, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v30, (a1)
+; LMULMAX2-RV32-NEXT:    vsrl.vv v30, v26, v30
+; LMULMAX2-RV32-NEXT:    addi a1, zero, 68
+; LMULMAX2-RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV32-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-RV32-NEXT:    vmerge.vim v28, v28, 1, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vsra.vv v26, v26, v30
-; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vsra.vv v26, v26, v28
+; LMULMAX2-RV32-NEXT:    vadd.vv v26, v26, v30
 ; LMULMAX2-RV32-NEXT:    vse64.v v26, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
 ;
@@ -4342,20 +4739,30 @@ define void @mulhs_v4i64(<4 x i64>* %x) {
 ; LMULMAX2-RV64:       # %bb.0:
 ; LMULMAX2-RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle64.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI136_0)
-; LMULMAX2-RV64-NEXT:    addi a1, a1, %lo(.LCPI136_0)
-; LMULMAX2-RV64-NEXT:    vle64.v v28, (a1)
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI136_1)
-; LMULMAX2-RV64-NEXT:    addi a1, a1, %lo(.LCPI136_1)
-; LMULMAX2-RV64-NEXT:    vle64.v v30, (a1)
+; LMULMAX2-RV64-NEXT:    addi a1, zero, 5
+; LMULMAX2-RV64-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.s.x v0, a1
+; LMULMAX2-RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; LMULMAX2-RV64-NEXT:    vmv.v.i v28, -1
+; LMULMAX2-RV64-NEXT:    vmerge.vim v28, v28, 0, v0
 ; LMULMAX2-RV64-NEXT:    vmul.vv v28, v26, v28
+; LMULMAX2-RV64-NEXT:    lui a1, 21845
+; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
+; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
+; LMULMAX2-RV64-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
+; LMULMAX2-RV64-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV64-NEXT:    slli a1, a1, 12
+; LMULMAX2-RV64-NEXT:    addi a2, a1, 1365
+; LMULMAX2-RV64-NEXT:    vmv.v.x v30, a2
+; LMULMAX2-RV64-NEXT:    addi a1, a1, 1366
+; LMULMAX2-RV64-NEXT:    vmerge.vxm v30, v30, a1, v0
 ; LMULMAX2-RV64-NEXT:    vmulh.vv v26, v26, v30
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI136_2)
-; LMULMAX2-RV64-NEXT:    addi a1, a1, %lo(.LCPI136_2)
-; LMULMAX2-RV64-NEXT:    vle64.v v30, (a1)
 ; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v28
 ; LMULMAX2-RV64-NEXT:    addi a1, zero, 63
 ; LMULMAX2-RV64-NEXT:    vsrl.vx v28, v26, a1
+; LMULMAX2-RV64-NEXT:    vmv.v.i v30, 1
+; LMULMAX2-RV64-NEXT:    vmerge.vim v30, v30, 0, v0
 ; LMULMAX2-RV64-NEXT:    vsra.vv v26, v26, v30
 ; LMULMAX2-RV64-NEXT:    vadd.vv v26, v26, v28
 ; LMULMAX2-RV64-NEXT:    vse64.v v26, (a0)
@@ -5304,10 +5711,12 @@ define void @add_iv_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI160_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI160_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    vmerge.vim v26, v26, 1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vadd.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -5556,10 +5965,12 @@ define void @sub_iv_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI174_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI174_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    vmerge.vim v26, v26, 1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vsub.vv v25, v26, v25
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -5825,10 +6236,12 @@ define void @and_vi_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI190_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI190_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, -1
+; RV32-NEXT:    vmerge.vim v26, v26, -2, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vand.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -5902,10 +6315,12 @@ define void @and_iv_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI194_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI194_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    vmerge.vim v26, v26, 1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vand.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -6075,10 +6490,12 @@ define void @or_vi_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI204_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI204_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, -1
+; RV32-NEXT:    vmerge.vim v26, v26, -2, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vor.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -6152,10 +6569,12 @@ define void @or_iv_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI208_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI208_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    vmerge.vim v26, v26, 1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vor.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -6400,10 +6819,12 @@ define void @xor_iv_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI222_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI222_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    vmerge.vim v26, v26, 1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vxor.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -6573,10 +6994,13 @@ define void @lshr_vi_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI232_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI232_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    addi a1, zero, 31
+; RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vsrl.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -6698,10 +7122,13 @@ define void @ashr_vi_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI239_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI239_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    addi a1, zero, 31
+; RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vsra.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -6823,10 +7250,13 @@ define void @shl_vi_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI246_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI246_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    addi a1, zero, 31
+; RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vsll.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -7170,16 +7600,20 @@ define void @mulhu_vx_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI265_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI265_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    lui a1, 699051
+; RV32-NEXT:    addi a2, a1, -1366
+; RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.x v26, a2
+; RV32-NEXT:    addi a1, a1, -1365
+; RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vmulhu.vv v25, v25, v26
-; RV32-NEXT:    lui a1, %hi(.LCPI265_1)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI265_1)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    vmerge.vim v26, v26, 1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vsrl.vv v25, v25, v26
 ; RV32-NEXT:    vse64.v v25, (a0)
@@ -7292,16 +7726,21 @@ define void @mulhs_vx_v2i64(<2 x i64>* %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vle64.v v25, (a0)
-; RV32-NEXT:    lui a1, %hi(.LCPI269_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI269_0)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    addi a1, zero, 5
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a2, a1, 1365
+; RV32-NEXT:    vsetivli a3, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.x v26, a2
+; RV32-NEXT:    addi a1, a1, 1366
+; RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vmulh.vv v25, v25, v26
-; RV32-NEXT:    lui a1, %hi(.LCPI269_1)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI269_1)
-; RV32-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
-; RV32-NEXT:    vle32.v v26, (a1)
+; RV32-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    addi a1, zero, 63
+; RV32-NEXT:    vmerge.vxm v26, v26, a1, v0
 ; RV32-NEXT:    vsetivli a1, 2, e64,m1,ta,mu
 ; RV32-NEXT:    vsrl.vv v26, v25, v26
 ; RV32-NEXT:    vadd.vv v25, v25, v26

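The fixed-vectors-int.ll changes above all follow the same shape: a constant
vector with one repeated (dominant) value plus a second, less common value is
no longer loaded from the constant pool (lui %hi(.LCPIxxx_y)/addi/vle).
Instead the dominant value is splatted with vmv.v.i or vmv.v.x, a scalar
bitmask with one bit per lane holding the other value is moved into v0 with
vmv.s.x (for example, 68 = 0b1000100 above selects lanes 2 and 6 of a v8i32),
and vmerge.vim/vmerge.vxm merges the second value into those lanes. A minimal
IR sketch of the kind of input this targets (a reduced, hypothetical function,
not one of the tests above):

    define <8 x i32> @shl_repeated_amounts(<8 x i32> %x) {
      ; Two repeating shift amounts: 2 dominates (six of eight lanes) and is
      ; splatted with vmv.v.i; 1 is merged into the remaining lanes under a
      ; v0 bitmask, avoiding a .LCPI constant-pool load.
      %r = shl <8 x i32> %x, <i32 2, i32 1, i32 2, i32 2, i32 2, i32 1, i32 2, i32 2>
      ret <8 x i32> %r
    }
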
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index e7ea8535ff4e2..32f4c270b8bad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -1035,17 +1035,22 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 x i1
 define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
 ; RV32-LABEL: mgather_baseidx_sext_v8i8_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf8 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI49_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI49_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_sext_v8i8_v8i64:
@@ -1066,17 +1071,22 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8
 define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
 ; RV32-LABEL: mgather_baseidx_zext_v8i8_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf8 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI50_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI50_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_zext_v8i8_v8i64:
@@ -1122,17 +1132,22 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(i64* %base, <8 x i16> %idxs, <8 x
 define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(i64* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
 ; RV32-LABEL: mgather_baseidx_sext_v8i16_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf4 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI52_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI52_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_sext_v8i16_v8i64:
@@ -1153,17 +1168,22 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(i64* %base, <8 x i16> %idxs,
 define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(i64* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
 ; RV32-LABEL: mgather_baseidx_zext_v8i16_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf4 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI53_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI53_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_zext_v8i16_v8i64:
@@ -1208,17 +1228,22 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(i64* %base, <8 x i32> %idxs, <8 x
 define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(i64* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
 ; RV32-LABEL: mgather_baseidx_sext_v8i32_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf2 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI55_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI55_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_sext_v8i32_v8i64:
@@ -1239,17 +1264,22 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(i64* %base, <8 x i32> %idxs,
 define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(i64* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
 ; RV32-LABEL: mgather_baseidx_zext_v8i32_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf2 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI56_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI56_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_zext_v8i32_v8i64:
@@ -1270,13 +1300,18 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(i64* %base, <8 x i32> %idxs,
 define <8 x i64> @mgather_baseidx_v8i64(i64* %base, <8 x i64> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
 ; RV32-LABEL: mgather_baseidx_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a1, %hi(.LCPI57_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI57_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v28, (a1)
+; RV32-NEXT:    vmv1r.v v25, v0
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v28, 0
+; RV32-NEXT:    vmerge.vim v28, v28, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v8, v28
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
@@ -1938,17 +1973,22 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(double* %base, <8 x i8> %idxs, <
 define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(double* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) {
 ; RV32-LABEL: mgather_baseidx_sext_v8i8_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf8 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI88_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI88_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_sext_v8i8_v8f64:
@@ -1969,17 +2009,22 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(double* %base, <8 x i8> %id
 define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(double* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) {
 ; RV32-LABEL: mgather_baseidx_zext_v8i8_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf8 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI89_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI89_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_zext_v8i8_v8f64:
@@ -2025,17 +2070,22 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(double* %base, <8 x i16> %idxs,
 define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(double* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) {
 ; RV32-LABEL: mgather_baseidx_sext_v8i16_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf4 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI91_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI91_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_sext_v8i16_v8f64:
@@ -2056,17 +2106,22 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(double* %base, <8 x i16> %
 define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(double* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) {
 ; RV32-LABEL: mgather_baseidx_zext_v8i16_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf4 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI92_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI92_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_zext_v8i16_v8f64:
@@ -2111,17 +2166,22 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(double* %base, <8 x i32> %idxs,
 define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(double* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x double> %passthru) {
 ; RV32-LABEL: mgather_baseidx_sext_v8i32_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf2 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI94_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI94_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_sext_v8i32_v8f64:
@@ -2142,17 +2202,22 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(double* %base, <8 x i32> %
 define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(double* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x double> %passthru) {
 ; RV32-LABEL: mgather_baseidx_zext_v8i32_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf2 v28, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI95_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI95_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v16, (a1)
-; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vim v8, v8, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
-; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsll.vv v28, v28, v8
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
-; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v0, v25
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mgather_baseidx_zext_v8i32_v8f64:
@@ -2173,13 +2238,18 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(double* %base, <8 x i32> %
 define <8 x double> @mgather_baseidx_v8f64(double* %base, <8 x i64> %idxs, <8 x i1> %m, <8 x double> %passthru) {
 ; RV32-LABEL: mgather_baseidx_v8f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a1, %hi(.LCPI96_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI96_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v28, (a1)
+; RV32-NEXT:    vmv1r.v v25, v0
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v28, 0
+; RV32-NEXT:    vmerge.vim v28, v28, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v8, v28
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    ret

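The masked-gather changes make the interaction with RVV's implicit mask
register visible: vmerge reads its mask from v0, but v0 already holds the
gather's own <8 x i1> mask, so the RV32 output now parks that mask in a
scratch register (vmv1r.v v25, v0) and restores it just before the masked
vloxei64.v. The repeated value being built is the <8 x i64> splat of 3, the
shift that scales i64 indices by 8 bytes; on RV32 it is constructed as a
16 x i32 vector by merging 3 over zero under the mask 0x5555 (lui a1, 5 gives
0x5000, addi a1, a1, 1365 adds 0x555), i.e. into every even 32-bit lane,
which is exactly the low half of each 64-bit element. A sketch of the IR
these tests reduce to (assumed names; the intrinsic signature follows the
typed-pointer form used by this test suite):

    define <8 x i64> @mgather_sketch(i64* %base, <8 x i64> %idxs, <8 x i1> %m, <8 x i64> %pt) {
      ; Scaled-index gather: the <8 x i1> mask %m is what ends up in v0 and
      ; must survive the vmerge-based materialization of the shift amounts.
      %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %idxs
      %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %pt)
      ret <8 x i64> %v
    }
    declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>)
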
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 7138a07efa766..4aee4d65147b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -857,15 +857,20 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %i
 define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf8 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI43_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI43_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -886,15 +891,20 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i
 define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf8 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI44_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI44_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -938,15 +948,20 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16>
 define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf4 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI46_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI46_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -967,15 +982,20 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x
 define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf4 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI47_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI47_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1018,15 +1038,20 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32>
 define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_sext_v8i32_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf2 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI49_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI49_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1047,15 +1072,20 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x
 define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_zext_v8i32_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf2 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI50_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI50_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1076,13 +1106,18 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x
 define void @mscatter_baseidx_v8i64(<8 x i64> %val, i64* %base, <8 x i64> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a1, %hi(.LCPI51_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI51_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v28, (a1)
+; RV32-NEXT:    vmv1r.v v25, v0
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v28, 0
+; RV32-NEXT:    vmerge.vim v28, v28, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v12, v28
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1688,15 +1723,20 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, double* %base, <8 x
 define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, double* %base, <8 x i8> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf8 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI82_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI82_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1717,15 +1757,20 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, double* %base,
 define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, double* %base, <8 x i8> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf8 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI83_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI83_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1769,15 +1814,20 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, double* %base, <8 x
 define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, double* %base, <8 x i16> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf4 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI85_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI85_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1798,15 +1848,20 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, double* %base,
 define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, double* %base, <8 x i16> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf4 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI86_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI86_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1849,15 +1904,20 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, double* %base, <8 x
 define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, double* %base, <8 x i32> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_sext_v8i32_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsext.vf2 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI88_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI88_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1878,15 +1938,20 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, double* %base,
 define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, double* %base, <8 x i32> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_zext_v8i32_v8f64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v25, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vzext.vf2 v28, v12
-; RV32-NEXT:    lui a1, %hi(.LCPI89_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI89_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmerge.vim v12, v12, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v28, v12
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -1907,13 +1972,18 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, double* %base,
 define void @mscatter_baseidx_v8f64(<8 x double> %val, double* %base, <8 x i64> %idxs, <8 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_v8f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a1, %hi(.LCPI90_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI90_0)
-; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v28, (a1)
+; RV32-NEXT:    vmv1r.v v25, v0
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli a2, 1, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v28, 0
+; RV32-NEXT:    vmerge.vim v28, v28, 3, v0
 ; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
 ; RV32-NEXT:    vsll.vv v28, v12, v28
 ; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index dd8e3e2d8f3c1..7a77666819953 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -699,14 +699,16 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
 ; RV32MV-NEXT:    call __moddi3 at plt
 ; RV32MV-NEXT:    sw a1, 12(sp)
 ; RV32MV-NEXT:    sw a0, 8(sp)
+; RV32MV-NEXT:    addi a0, zero, 85
+; RV32MV-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV32MV-NEXT:    vmv.s.x v0, a0
+; RV32MV-NEXT:    vsetivli a0, 8, e32,m2,ta,mu
+; RV32MV-NEXT:    vmv.v.i v26, 1
+; RV32MV-NEXT:    vle32.v v28, (sp)
 ; RV32MV-NEXT:    lui a0, %hi(.LCPI3_0)
 ; RV32MV-NEXT:    addi a0, a0, %lo(.LCPI3_0)
-; RV32MV-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
-; RV32MV-NEXT:    vle32.v v26, (a0)
-; RV32MV-NEXT:    vle32.v v28, (sp)
-; RV32MV-NEXT:    lui a0, %hi(.LCPI3_1)
-; RV32MV-NEXT:    addi a0, a0, %lo(.LCPI3_1)
 ; RV32MV-NEXT:    vle32.v v30, (a0)
+; RV32MV-NEXT:    vmerge.vim v26, v26, -1, v0
 ; RV32MV-NEXT:    vand.vv v26, v28, v26
 ; RV32MV-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
 ; RV32MV-NEXT:    vmsne.vv v0, v26, v30
