[llvm-branch-commits] [llvm] [LoongArch] Perform DAG combine for MUL to generate `[x]vmulw{ev/od}` (PR #161368)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Oct 9 04:43:34 PDT 2025


https://github.com/zhaoqi5 updated https://github.com/llvm/llvm-project/pull/161368

>From ba68f214a72e8867718e7624b62b21c32e19a98d Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Tue, 30 Sep 2025 20:53:06 +0800
Subject: [PATCH 1/3] [LoongArch] Perform DAG combine for MUL to generate
 `[x]vmulw{ev/od}`

---
 .../LoongArch/LoongArchISelLowering.cpp       |  118 +
 .../Target/LoongArch/LoongArchISelLowering.h  |   11 +-
 .../LoongArch/LoongArchLASXInstrInfo.td       |   41 +
 .../Target/LoongArch/LoongArchLSXInstrInfo.td |   48 +
 .../lasx/ir-instruction/mulwev_od.ll          | 2268 +----------------
 .../LoongArch/lsx/ir-instruction/mulwev_od.ll |  186 +-
 6 files changed, 373 insertions(+), 2299 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 7ddf996f53f4c..2763cef394620 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -462,6 +462,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
   if (Subtarget.hasExtLSX()) {
     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
     setTargetDAGCombine(ISD::BITCAST);
+    setTargetDAGCombine(ISD::MUL);
   }
 
   // Set DAG combine for 'LASX' feature.
@@ -6679,6 +6680,115 @@ performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT ResTy = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (ResTy != MVT::v8i16 && ResTy != MVT::v4i32 && ResTy != MVT::v2i64 &&
+      ResTy != MVT::v16i16 && ResTy != MVT::v8i32 &&
+      ResTy != MVT::v4i64) // && ResTy != MVT::v2i128)
+    return SDValue();
+
+  // Combine:
+  //   ti,tii,...,tx = extract_vector_elt t0, {0,2,4,.../1,3,5,...}
+  //   tj,tjj,...,ty = extract_vector_elt t1, {0,2,4,.../1,3,5,...}
+  //   tm = BUILD_VECTOR ti,tii,...,tx
+  //   tn = BUILD_VECTOR tj,tjj,...,ty
+  //   ta = {sign/zero}_extend tm
+  //   tb = {sign/zero}_extend tn
+  //   tr = mul ta, tb
+  // to:
+  //   tr = VMULW{EV/OD}[U/US] t0, t1
+  // Returns -1 when the pair of extends does not match any VMULW form; the
+  // result must be signed so the failure sentinel survives the comparison.
+  auto getExtType = [](unsigned Op0, unsigned Op1) -> int {
+    if (Op0 == ISD::SIGN_EXTEND && Op1 == ISD::SIGN_EXTEND)
+      return 0;
+    if (Op0 == ISD::ZERO_EXTEND && Op1 == ISD::ZERO_EXTEND)
+      return 1;
+    if (Op0 == ISD::ZERO_EXTEND && Op1 == ISD::SIGN_EXTEND)
+      return 2;
+    if (Op0 == ISD::SIGN_EXTEND && Op1 == ISD::ZERO_EXTEND)
+      return 3;
+    return -1;
+  };
+
+  int ExtType = getExtType(N0.getOpcode(), N1.getOpcode());
+  if (ExtType < 0)
+    return SDValue();
+
+  SDValue BV0 = N0.getOperand(0);
+  SDValue BV1 = N1.getOperand(0);
+  if (BV0.getOpcode() != ISD::BUILD_VECTOR ||
+      BV1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  unsigned ResBits = ResTy.getScalarType().getSizeInBits();
+  unsigned BV0Bits = BV0.getValueType().getScalarType().getSizeInBits();
+  unsigned BV1Bits = BV1.getValueType().getScalarType().getSizeInBits();
+  if (BV0Bits != BV1Bits || ResBits != BV0Bits * 2)
+    return SDValue();
+
+  unsigned Index = 0;
+  SDValue OrigN0, OrigN1;
+  for (unsigned i = 0; i < BV0.getNumOperands(); ++i) {
+    SDValue Op0 = BV0.getOperand(i);
+    SDValue Op1 = BV1.getOperand(i);
+    // Each element of BUILD_VECTOR must be EXTRACT_VECTOR_ELT.
+    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Check each EXTRACT_VECTOR_ELT's source vector and index.
+    if (Op0.getOperand(1) != Op1.getOperand(1))
+      return SDValue();
+
+    auto *IdxC = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+    if (!IdxC)
+      return SDValue();
+    unsigned CurIdx = IdxC->getZExtValue();
+
+    if (i == 0) {
+      if (CurIdx != 0 && CurIdx != 1)
+        return SDValue();
+      OrigN0 = Op0.getOperand(0);
+      OrigN1 = Op1.getOperand(0);
+    } else {
+      if (CurIdx != Index + 2)
+        return SDValue();
+      if (Op0.getOperand(0) != OrigN0 || Op1.getOperand(0) != OrigN1)
+        return SDValue();
+    }
+    Index = CurIdx;
+  }
+
+  if (OrigN0.getValueType() != OrigN1.getValueType())
+    return SDValue();
+  if (OrigN0.getValueType().getVectorNumElements() !=
+      ResTy.getVectorNumElements() * 2)
+    return SDValue();
+
+  SDValue Result;
+  EVT OrigTy = OrigN0.getValueType();
+  bool IsEven = (Index % 2 == 0);
+
+  static const unsigned OpcTable[3][2] = {
+      {LoongArchISD::VMULWOD, LoongArchISD::VMULWEV},
+      {LoongArchISD::VMULWODU, LoongArchISD::VMULWEVU},
+      {LoongArchISD::VMULWODUS, LoongArchISD::VMULWEVUS}};
+
+  if (ExtType == 3)
+    Result = DAG.getNode(OpcTable[2][IsEven], DL, OrigTy, OrigN1, OrigN0);
+  else
+    Result = DAG.getNode(OpcTable[ExtType][IsEven], DL, OrigTy, OrigN0, OrigN1);
+
+  return DAG.getBitcast(ResTy, Result);
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -6714,6 +6824,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     return performSPLIT_PAIR_F64Combine(N, DAG, DCI, Subtarget);
   case ISD::EXTRACT_VECTOR_ELT:
     return performEXTRACT_VECTOR_ELTCombine(N, DAG, DCI, Subtarget);
+  case ISD::MUL:
+    return performMULCombine(N, DAG, DCI);
   }
   return SDValue();
 }
@@ -7526,6 +7638,12 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(XVMSKEQZ)
     NODE_NAME_CASE(XVMSKNEZ)
     NODE_NAME_CASE(VHADDW)
+    NODE_NAME_CASE(VMULWEV)
+    NODE_NAME_CASE(VMULWOD)
+    NODE_NAME_CASE(VMULWEVU)
+    NODE_NAME_CASE(VMULWODU)
+    NODE_NAME_CASE(VMULWEVUS)
+    NODE_NAME_CASE(VMULWODUS)
   }
 #undef NODE_NAME_CASE
   return nullptr;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 8a4d7748467c7..1e5632eb00f7b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -189,7 +189,16 @@ enum NodeType : unsigned {
   XVMSKNEZ,
 
   // Vector Horizontal Addition with Widening‌
-  VHADDW
+  VHADDW,
+
+  // Perform element-wise vector multiplication at even/odd indices,
+  // and keep each result in its corresponding widened slot
+  VMULWEV,
+  VMULWOD,
+  VMULWEVU,
+  VMULWODU,
+  VMULWEVUS,
+  VMULWODUS
 
   // Intrinsic operations end =============================================
 };
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 5143d53bad719..7c28efd88ae09 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1328,6 +1328,39 @@ multiclass PairInsertExtractPatV4<ValueType vecty, ValueType elemty> {
   }
 }
 
+multiclass XVmulwPat<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_H_B") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v16i16 LASX256:$xj), (v16i16 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_W_H") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v8i32 LASX256:$xj), (v8i32 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_D_W") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v4i64 LASX256:$xj), (v4i64 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_Q_D") LASX256:$xj, LASX256:$xk)>;
+}
+
+multiclass XVmulwuPat<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_H_BU") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v16i16 LASX256:$xj), (v16i16 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_W_HU") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v8i32 LASX256:$xj), (v8i32 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_D_WU") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v4i64 LASX256:$xj), (v4i64 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_Q_DU") LASX256:$xj, LASX256:$xk)>;
+}
+
+multiclass XVmulwusPat<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_H_BU_B") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v16i16 LASX256:$xj), (v16i16 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_W_HU_H") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v8i32 LASX256:$xj), (v8i32 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_D_WU_W") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v4i64 LASX256:$xj), (v4i64 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_Q_DU_D") LASX256:$xj, LASX256:$xk)>;
+}
+
 let Predicates = [HasExtLASX] in {
 
 // XVADD_{B/H/W/D}
@@ -1365,6 +1398,14 @@ defm : PatXrXr<mul, "XVMUL">;
 defm : PatXrXr<mulhs, "XVMUH">;
 defm : PatXrXrU<mulhu, "XVMUH">;
 
+// XVMULW{EV/OD}_{H_B/W_H/D_W/Q_D}[U], XVMULW{EV/OD}_{H_BU_B/W_HU_H/D_WU_W/Q_DU_D}
+defm : XVmulwPat<loongarch_vmulwev, "XVMULWEV">;
+defm : XVmulwPat<loongarch_vmulwod, "XVMULWOD">;
+defm : XVmulwuPat<loongarch_vmulwevu, "XVMULWEV">;
+defm : XVmulwuPat<loongarch_vmulwodu, "XVMULWOD">;
+defm : XVmulwusPat<loongarch_vmulwevus, "XVMULWEV">;
+defm : XVmulwusPat<loongarch_vmulwodus, "XVMULWOD">;
+
 // XVMADD_{B/H/W/D}
 defm : PatXrXrXr<muladd, "XVMADD">;
 // XVMSUB_{B/H/W/D}
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 8d1dc99e316c9..e34f6d7e58610 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -82,6 +82,13 @@ def loongarch_vmskgez: SDNode<"LoongArchISD::VMSKGEZ", SDT_LoongArchVMSKCOND>;
 def loongarch_vmskeqz: SDNode<"LoongArchISD::VMSKEQZ", SDT_LoongArchVMSKCOND>;
 def loongarch_vmsknez: SDNode<"LoongArchISD::VMSKNEZ", SDT_LoongArchVMSKCOND>;
 
+def loongarch_vmulwev: SDNode<"LoongArchISD::VMULWEV", SDT_LoongArchV2R>;
+def loongarch_vmulwod: SDNode<"LoongArchISD::VMULWOD", SDT_LoongArchV2R>;
+def loongarch_vmulwevu: SDNode<"LoongArchISD::VMULWEVU", SDT_LoongArchV2R>;
+def loongarch_vmulwodu: SDNode<"LoongArchISD::VMULWODU", SDT_LoongArchV2R>;
+def loongarch_vmulwevus: SDNode<"LoongArchISD::VMULWEVUS", SDT_LoongArchV2R>;
+def loongarch_vmulwodus: SDNode<"LoongArchISD::VMULWODUS", SDT_LoongArchV2R>;
+
 def immZExt1 : ImmLeaf<GRLenVT, [{return isUInt<1>(Imm);}]>;
 def immZExt2 : ImmLeaf<GRLenVT, [{return isUInt<2>(Imm);}]>;
 def immZExt3 : ImmLeaf<GRLenVT, [{return isUInt<3>(Imm);}]>;
@@ -1518,6 +1525,39 @@ multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> {
   }
 }
 
+multiclass VmulwPat<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_H_B") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_W_H") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v4i32 LSX128:$vj), (v4i32 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_D_W") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v2i64 LSX128:$vj), (v2i64 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_Q_D") LSX128:$vj, LSX128:$vk)>;
+}
+
+multiclass VmulwuPat<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_H_BU") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_W_HU") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v4i32 LSX128:$vj), (v4i32 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_D_WU") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v2i64 LSX128:$vj), (v2i64 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_Q_DU") LSX128:$vj, LSX128:$vk)>;
+}
+
+multiclass VmulwusPat<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_H_BU_B") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_W_HU_H") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v4i32 LSX128:$vj), (v4i32 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_D_WU_W") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v2i64 LSX128:$vj), (v2i64 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_Q_DU_D") LSX128:$vj, LSX128:$vk)>;
+}
+
 let Predicates = [HasExtLSX] in {
 
 // VADD_{B/H/W/D}
@@ -1555,6 +1595,14 @@ defm : PatVrVr<mul, "VMUL">;
 defm : PatVrVr<mulhs, "VMUH">;
 defm : PatVrVrU<mulhu, "VMUH">;
 
+// VMULW{EV/OD}_{H_B/W_H/D_W/Q_D}[U], VMULW{EV/OD}_{H_BU_B/W_HU_H/D_WU_W/Q_DU_D}
+defm : VmulwPat<loongarch_vmulwev, "VMULWEV">;
+defm : VmulwPat<loongarch_vmulwod, "VMULWOD">;
+defm : VmulwuPat<loongarch_vmulwevu, "VMULWEV">;
+defm : VmulwuPat<loongarch_vmulwodu, "VMULWOD">;
+defm : VmulwusPat<loongarch_vmulwevus, "VMULWEV">;
+defm : VmulwusPat<loongarch_vmulwodus, "VMULWOD">;
+
 // VMADD_{B/H/W/D}
 defm : PatVrVrVr<muladd, "VMADD">;
 // VMSUB_{B/H/W/D}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll
index c8796b839913c..605325f4dc4f4 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll
@@ -5,109 +5,9 @@
 define void @vmulwev_h_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_h_b:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvld $xr3, $a1, 0
-; CHECK-NEXT:    xvld $xr0, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr3, 14
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 0
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 2
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 4
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 6
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 8
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 10
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 12
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 0
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 2
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 4
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 6
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 8
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 10
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 12
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 14
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 0
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 2
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 4
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 6
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 8
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 10
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 12
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 0
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 2
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 4
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 6
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 8
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 10
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 12
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 7
-; CHECK-NEXT:    xvpermi.q $xr1, $xr3, 2
-; CHECK-NEXT:    xvpermi.q $xr4, $xr0, 2
-; CHECK-NEXT:    xvmul.h $xr0, $xr1, $xr4
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.h.b $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -127,59 +27,7 @@ define void @vmulwev_w_h(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr2, 6
-; CHECK-NEXT:    vpickve2gr.h $a2, $vr2, 4
-; CHECK-NEXT:    vpickve2gr.h $a3, $vr2, 2
-; CHECK-NEXT:    vpickve2gr.h $a4, $vr2, 0
-; CHECK-NEXT:    vpickve2gr.h $a5, $vr0, 6
-; CHECK-NEXT:    vpickve2gr.h $a6, $vr0, 4
-; CHECK-NEXT:    vpickve2gr.h $a7, $vr0, 2
-; CHECK-NEXT:    vpickve2gr.h $t0, $vr0, 0
-; CHECK-NEXT:    xvpermi.d $xr0, $xr1, 14
-; CHECK-NEXT:    vpickve2gr.h $t1, $vr0, 6
-; CHECK-NEXT:    vpickve2gr.h $t2, $vr0, 4
-; CHECK-NEXT:    vpickve2gr.h $t3, $vr0, 2
-; CHECK-NEXT:    vpickve2gr.h $t4, $vr0, 0
-; CHECK-NEXT:    vpickve2gr.h $t5, $vr1, 6
-; CHECK-NEXT:    vpickve2gr.h $t6, $vr1, 4
-; CHECK-NEXT:    vpickve2gr.h $t7, $vr1, 2
-; CHECK-NEXT:    vpickve2gr.h $t8, $vr1, 0
-; CHECK-NEXT:    ext.w.h $t0, $t0
-; CHECK-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; CHECK-NEXT:    ext.w.h $a7, $a7
-; CHECK-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; CHECK-NEXT:    ext.w.h $a6, $a6
-; CHECK-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; CHECK-NEXT:    ext.w.h $a5, $a5
-; CHECK-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; CHECK-NEXT:    ext.w.h $a4, $a4
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; CHECK-NEXT:    ext.w.h $a3, $a3
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; CHECK-NEXT:    ext.w.h $a2, $a2
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; CHECK-NEXT:    ext.w.h $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; CHECK-NEXT:    xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT:    ext.w.h $a1, $t8
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; CHECK-NEXT:    ext.w.h $a1, $t7
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; CHECK-NEXT:    ext.w.h $a1, $t6
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; CHECK-NEXT:    ext.w.h $a1, $t5
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; CHECK-NEXT:    ext.w.h $a1, $t4
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; CHECK-NEXT:    ext.w.h $a1, $t3
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; CHECK-NEXT:    ext.w.h $a1, $t2
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; CHECK-NEXT:    ext.w.h $a1, $t1
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; CHECK-NEXT:    xvpermi.q $xr1, $xr2, 2
-; CHECK-NEXT:    xvmul.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvmulwev.w.h $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -195,73 +43,13 @@ entry:
 }
 
 define void @vmulwev_d_w(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_d_w:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 6
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr0, 4
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr1, 2
-; LA32-NEXT:    xvpickve2gr.w $a6, $xr1, 0
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr1, 6
-; LA32-NEXT:    xvpickve2gr.w $t0, $xr1, 4
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 0
-; LA32-NEXT:    srai.w $a4, $a4, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 1
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 2
-; LA32-NEXT:    srai.w $a3, $a3, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 3
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 0
-; LA32-NEXT:    srai.w $a2, $a2, 31
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 1
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA32-NEXT:    srai.w $a1, $a1, 31
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA32-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA32-NEXT:    srai.w $a1, $t0, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 1
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a7, 2
-; LA32-NEXT:    srai.w $a1, $a7, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 3
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a6, 0
-; LA32-NEXT:    srai.w $a1, $a6, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a5, 2
-; LA32-NEXT:    srai.w $a1, $a5, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA32-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_d_w:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; LA64-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA64-NEXT:    xvpickve2gr.w $a3, $xr0, 6
-; LA64-NEXT:    xvpickve2gr.w $a4, $xr0, 4
-; LA64-NEXT:    xvpickve2gr.w $a5, $xr1, 2
-; LA64-NEXT:    xvpickve2gr.w $a6, $xr1, 0
-; LA64-NEXT:    xvpickve2gr.w $a7, $xr1, 6
-; LA64-NEXT:    xvpickve2gr.w $t0, $xr1, 4
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a4, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a3, 1
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a2, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA64-NEXT:    vinsgr2vr.d $vr0, $t0, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a7, 1
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a6, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a5, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA64-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_d_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.d.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i32>, ptr %a
   %vb = load <8 x i32>, ptr %b
@@ -423,109 +211,9 @@ entry:
 define void @vmulwod_h_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwod_h_b:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvld $xr3, $a1, 0
-; CHECK-NEXT:    xvld $xr0, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr3, 14
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 1
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 3
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 5
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 7
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 9
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 11
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 13
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 15
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 1
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 3
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 5
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 7
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 9
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 11
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 13
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 15
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 1
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 3
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 5
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 7
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 9
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 11
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 13
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 15
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 1
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 3
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 5
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 7
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 9
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 11
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 13
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 15
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 7
-; CHECK-NEXT:    xvpermi.q $xr1, $xr3, 2
-; CHECK-NEXT:    xvpermi.q $xr4, $xr0, 2
-; CHECK-NEXT:    xvmul.h $xr0, $xr1, $xr4
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.h.b $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -545,59 +233,7 @@ define void @vmulwod_w_h(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr2, 7
-; CHECK-NEXT:    vpickve2gr.h $a2, $vr2, 5
-; CHECK-NEXT:    vpickve2gr.h $a3, $vr2, 3
-; CHECK-NEXT:    vpickve2gr.h $a4, $vr2, 1
-; CHECK-NEXT:    vpickve2gr.h $a5, $vr0, 7
-; CHECK-NEXT:    vpickve2gr.h $a6, $vr0, 5
-; CHECK-NEXT:    vpickve2gr.h $a7, $vr0, 3
-; CHECK-NEXT:    vpickve2gr.h $t0, $vr0, 1
-; CHECK-NEXT:    xvpermi.d $xr0, $xr1, 14
-; CHECK-NEXT:    vpickve2gr.h $t1, $vr0, 7
-; CHECK-NEXT:    vpickve2gr.h $t2, $vr0, 5
-; CHECK-NEXT:    vpickve2gr.h $t3, $vr0, 3
-; CHECK-NEXT:    vpickve2gr.h $t4, $vr0, 1
-; CHECK-NEXT:    vpickve2gr.h $t5, $vr1, 7
-; CHECK-NEXT:    vpickve2gr.h $t6, $vr1, 5
-; CHECK-NEXT:    vpickve2gr.h $t7, $vr1, 3
-; CHECK-NEXT:    vpickve2gr.h $t8, $vr1, 1
-; CHECK-NEXT:    ext.w.h $t0, $t0
-; CHECK-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; CHECK-NEXT:    ext.w.h $a7, $a7
-; CHECK-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; CHECK-NEXT:    ext.w.h $a6, $a6
-; CHECK-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; CHECK-NEXT:    ext.w.h $a5, $a5
-; CHECK-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; CHECK-NEXT:    ext.w.h $a4, $a4
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; CHECK-NEXT:    ext.w.h $a3, $a3
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; CHECK-NEXT:    ext.w.h $a2, $a2
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; CHECK-NEXT:    ext.w.h $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; CHECK-NEXT:    xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT:    ext.w.h $a1, $t8
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; CHECK-NEXT:    ext.w.h $a1, $t7
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; CHECK-NEXT:    ext.w.h $a1, $t6
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; CHECK-NEXT:    ext.w.h $a1, $t5
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; CHECK-NEXT:    ext.w.h $a1, $t4
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; CHECK-NEXT:    ext.w.h $a1, $t3
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; CHECK-NEXT:    ext.w.h $a1, $t2
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; CHECK-NEXT:    ext.w.h $a1, $t1
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; CHECK-NEXT:    xvpermi.q $xr1, $xr2, 2
-; CHECK-NEXT:    xvmul.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvmulwod.w.h $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -613,73 +249,13 @@ entry:
 }
 
 define void @vmulwod_d_w(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_d_w:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 1
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr0, 5
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr1, 3
-; LA32-NEXT:    xvpickve2gr.w $a6, $xr1, 1
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr1, 7
-; LA32-NEXT:    xvpickve2gr.w $t0, $xr1, 5
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 0
-; LA32-NEXT:    srai.w $a4, $a4, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 1
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 2
-; LA32-NEXT:    srai.w $a3, $a3, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 3
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 0
-; LA32-NEXT:    srai.w $a2, $a2, 31
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 1
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA32-NEXT:    srai.w $a1, $a1, 31
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA32-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA32-NEXT:    srai.w $a1, $t0, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 1
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a7, 2
-; LA32-NEXT:    srai.w $a1, $a7, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 3
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a6, 0
-; LA32-NEXT:    srai.w $a1, $a6, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a5, 2
-; LA32-NEXT:    srai.w $a1, $a5, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA32-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_d_w:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; LA64-NEXT:    xvpickve2gr.w $a2, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA64-NEXT:    xvpickve2gr.w $a4, $xr0, 5
-; LA64-NEXT:    xvpickve2gr.w $a5, $xr1, 3
-; LA64-NEXT:    xvpickve2gr.w $a6, $xr1, 1
-; LA64-NEXT:    xvpickve2gr.w $a7, $xr1, 7
-; LA64-NEXT:    xvpickve2gr.w $t0, $xr1, 5
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a4, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a3, 1
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a2, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA64-NEXT:    vinsgr2vr.d $vr0, $t0, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a7, 1
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a6, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a5, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA64-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_d_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.d.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i32>, ptr %a
   %vb = load <8 x i32>, ptr %b
@@ -841,109 +417,9 @@ entry:
 define void @vmulwev_h_bu(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_h_bu:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvld $xr3, $a1, 0
-; CHECK-NEXT:    xvld $xr0, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr3, 14
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 0
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 2
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 4
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 6
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 8
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 10
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 12
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 0
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 2
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 4
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 6
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 8
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 10
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 12
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 14
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 0
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 2
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 4
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 6
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 8
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 10
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 12
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 0
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 2
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 4
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 6
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 8
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 10
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 12
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 7
-; CHECK-NEXT:    xvpermi.q $xr1, $xr3, 2
-; CHECK-NEXT:    xvpermi.q $xr4, $xr0, 2
-; CHECK-NEXT:    xvmul.h $xr0, $xr1, $xr4
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.h.bu $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -959,125 +435,13 @@ entry:
 }
 
 define void @vmulwev_w_hu(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_w_hu:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA32-NEXT:    vpickve2gr.h $a1, $vr2, 6
-; LA32-NEXT:    vpickve2gr.h $a2, $vr2, 4
-; LA32-NEXT:    vpickve2gr.h $a3, $vr2, 2
-; LA32-NEXT:    vpickve2gr.h $a4, $vr2, 0
-; LA32-NEXT:    vpickve2gr.h $a5, $vr0, 6
-; LA32-NEXT:    vpickve2gr.h $a6, $vr0, 4
-; LA32-NEXT:    vpickve2gr.h $a7, $vr0, 2
-; LA32-NEXT:    vpickve2gr.h $t0, $vr0, 0
-; LA32-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA32-NEXT:    vpickve2gr.h $t1, $vr0, 6
-; LA32-NEXT:    vpickve2gr.h $t2, $vr0, 4
-; LA32-NEXT:    vpickve2gr.h $t3, $vr0, 2
-; LA32-NEXT:    vpickve2gr.h $t4, $vr0, 0
-; LA32-NEXT:    vpickve2gr.h $t5, $vr1, 6
-; LA32-NEXT:    vpickve2gr.h $t6, $vr1, 4
-; LA32-NEXT:    vpickve2gr.h $t7, $vr1, 2
-; LA32-NEXT:    vpickve2gr.h $t8, $vr1, 0
-; LA32-NEXT:    bstrpick.w $t0, $t0, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA32-NEXT:    bstrpick.w $a7, $a7, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA32-NEXT:    bstrpick.w $a6, $a6, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA32-NEXT:    bstrpick.w $a5, $a5, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA32-NEXT:    bstrpick.w $a4, $a4, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA32-NEXT:    bstrpick.w $a3, $a3, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA32-NEXT:    bstrpick.w $a2, $a2, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA32-NEXT:    bstrpick.w $a1, $a1, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t8, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA32-NEXT:    bstrpick.w $a1, $t7, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA32-NEXT:    bstrpick.w $a1, $t6, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t5, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    bstrpick.w $a1, $t4, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA32-NEXT:    bstrpick.w $a1, $t3, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA32-NEXT:    bstrpick.w $a1, $t2, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t1, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA32-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_w_hu:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA64-NEXT:    vpickve2gr.h $a1, $vr2, 6
-; LA64-NEXT:    vpickve2gr.h $a2, $vr2, 4
-; LA64-NEXT:    vpickve2gr.h $a3, $vr2, 2
-; LA64-NEXT:    vpickve2gr.h $a4, $vr2, 0
-; LA64-NEXT:    vpickve2gr.h $a5, $vr0, 6
-; LA64-NEXT:    vpickve2gr.h $a6, $vr0, 4
-; LA64-NEXT:    vpickve2gr.h $a7, $vr0, 2
-; LA64-NEXT:    vpickve2gr.h $t0, $vr0, 0
-; LA64-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA64-NEXT:    vpickve2gr.h $t1, $vr0, 6
-; LA64-NEXT:    vpickve2gr.h $t2, $vr0, 4
-; LA64-NEXT:    vpickve2gr.h $t3, $vr0, 2
-; LA64-NEXT:    vpickve2gr.h $t4, $vr0, 0
-; LA64-NEXT:    vpickve2gr.h $t5, $vr1, 6
-; LA64-NEXT:    vpickve2gr.h $t6, $vr1, 4
-; LA64-NEXT:    vpickve2gr.h $t7, $vr1, 2
-; LA64-NEXT:    vpickve2gr.h $t8, $vr1, 0
-; LA64-NEXT:    bstrpick.d $t0, $t0, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA64-NEXT:    bstrpick.d $a7, $a7, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA64-NEXT:    bstrpick.d $a6, $a6, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA64-NEXT:    bstrpick.d $a5, $a5, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA64-NEXT:    bstrpick.d $a4, $a4, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA64-NEXT:    bstrpick.d $a3, $a3, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t8, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $t7, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $t6, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t5, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    bstrpick.d $a1, $t4, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $t3, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $t2, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t1, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA64-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_w_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.w.hu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <16 x i16>, ptr %a
   %vb = load <16 x i16>, ptr %b
@@ -1091,63 +455,13 @@ entry:
 }
 
 define void @vmulwev_d_wu(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_d_wu:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvrepli.b $xr1, 0
-; LA32-NEXT:    xvld $xr2, $a2, 0
-; LA32-NEXT:    xvori.b $xr3, $xr1, 0
-; LA32-NEXT:    xvinsve0.w $xr3, $xr0, 0
-; LA32-NEXT:    xvpickve.w $xr4, $xr0, 2
-; LA32-NEXT:    xvinsve0.w $xr3, $xr4, 2
-; LA32-NEXT:    xvpickve.w $xr4, $xr0, 4
-; LA32-NEXT:    xvinsve0.w $xr3, $xr4, 4
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 6
-; LA32-NEXT:    xvinsve0.w $xr3, $xr0, 6
-; LA32-NEXT:    xvinsve0.w $xr1, $xr2, 0
-; LA32-NEXT:    xvpickve.w $xr0, $xr2, 2
-; LA32-NEXT:    xvinsve0.w $xr1, $xr0, 2
-; LA32-NEXT:    xvpickve.w $xr0, $xr2, 4
-; LA32-NEXT:    xvinsve0.w $xr1, $xr0, 4
-; LA32-NEXT:    xvpickve.w $xr0, $xr2, 6
-; LA32-NEXT:    xvinsve0.w $xr1, $xr0, 6
-; LA32-NEXT:    xvmul.d $xr0, $xr3, $xr1
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_d_wu:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; LA64-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA64-NEXT:    xvpickve2gr.w $a3, $xr0, 6
-; LA64-NEXT:    xvpickve2gr.w $a4, $xr0, 4
-; LA64-NEXT:    xvpickve2gr.w $a5, $xr1, 2
-; LA64-NEXT:    xvpickve2gr.w $a6, $xr1, 0
-; LA64-NEXT:    xvpickve2gr.w $a7, $xr1, 6
-; LA64-NEXT:    xvpickve2gr.w $t0, $xr1, 4
-; LA64-NEXT:    bstrpick.d $a4, $a4, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a4, 0
-; LA64-NEXT:    bstrpick.d $a3, $a3, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a3, 1
-; LA64-NEXT:    bstrpick.d $a2, $a2, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a2, 0
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA64-NEXT:    bstrpick.d $a1, $t0, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $a7, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $a6, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $a5, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA64-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_d_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.d.wu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i32>, ptr %a
   %vb = load <8 x i32>, ptr %b
@@ -1255,109 +569,9 @@ entry:
 define void @vmulwod_h_bu(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwod_h_bu:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvld $xr3, $a1, 0
-; CHECK-NEXT:    xvld $xr0, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr3, 14
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 1
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 3
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 5
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 7
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 9
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 11
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 13
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 15
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 1
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 3
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 5
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 7
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 9
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 11
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 13
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 15
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 1
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 3
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 5
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 7
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 9
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 11
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 13
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 15
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 1
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 3
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 5
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 7
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 9
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 11
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 13
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 15
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 7
-; CHECK-NEXT:    xvpermi.q $xr1, $xr3, 2
-; CHECK-NEXT:    xvpermi.q $xr4, $xr0, 2
-; CHECK-NEXT:    xvmul.h $xr0, $xr1, $xr4
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.h.bu $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -1373,125 +587,13 @@ entry:
 }
 
 define void @vmulwod_w_hu(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_w_hu:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA32-NEXT:    vpickve2gr.h $a1, $vr2, 7
-; LA32-NEXT:    vpickve2gr.h $a2, $vr2, 5
-; LA32-NEXT:    vpickve2gr.h $a3, $vr2, 3
-; LA32-NEXT:    vpickve2gr.h $a4, $vr2, 1
-; LA32-NEXT:    vpickve2gr.h $a5, $vr0, 7
-; LA32-NEXT:    vpickve2gr.h $a6, $vr0, 5
-; LA32-NEXT:    vpickve2gr.h $a7, $vr0, 3
-; LA32-NEXT:    vpickve2gr.h $t0, $vr0, 1
-; LA32-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA32-NEXT:    vpickve2gr.h $t1, $vr0, 7
-; LA32-NEXT:    vpickve2gr.h $t2, $vr0, 5
-; LA32-NEXT:    vpickve2gr.h $t3, $vr0, 3
-; LA32-NEXT:    vpickve2gr.h $t4, $vr0, 1
-; LA32-NEXT:    vpickve2gr.h $t5, $vr1, 7
-; LA32-NEXT:    vpickve2gr.h $t6, $vr1, 5
-; LA32-NEXT:    vpickve2gr.h $t7, $vr1, 3
-; LA32-NEXT:    vpickve2gr.h $t8, $vr1, 1
-; LA32-NEXT:    bstrpick.w $t0, $t0, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA32-NEXT:    bstrpick.w $a7, $a7, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA32-NEXT:    bstrpick.w $a6, $a6, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA32-NEXT:    bstrpick.w $a5, $a5, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA32-NEXT:    bstrpick.w $a4, $a4, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA32-NEXT:    bstrpick.w $a3, $a3, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA32-NEXT:    bstrpick.w $a2, $a2, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA32-NEXT:    bstrpick.w $a1, $a1, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t8, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA32-NEXT:    bstrpick.w $a1, $t7, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA32-NEXT:    bstrpick.w $a1, $t6, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t5, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    bstrpick.w $a1, $t4, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA32-NEXT:    bstrpick.w $a1, $t3, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA32-NEXT:    bstrpick.w $a1, $t2, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t1, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA32-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_w_hu:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA64-NEXT:    vpickve2gr.h $a1, $vr2, 7
-; LA64-NEXT:    vpickve2gr.h $a2, $vr2, 5
-; LA64-NEXT:    vpickve2gr.h $a3, $vr2, 3
-; LA64-NEXT:    vpickve2gr.h $a4, $vr2, 1
-; LA64-NEXT:    vpickve2gr.h $a5, $vr0, 7
-; LA64-NEXT:    vpickve2gr.h $a6, $vr0, 5
-; LA64-NEXT:    vpickve2gr.h $a7, $vr0, 3
-; LA64-NEXT:    vpickve2gr.h $t0, $vr0, 1
-; LA64-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA64-NEXT:    vpickve2gr.h $t1, $vr0, 7
-; LA64-NEXT:    vpickve2gr.h $t2, $vr0, 5
-; LA64-NEXT:    vpickve2gr.h $t3, $vr0, 3
-; LA64-NEXT:    vpickve2gr.h $t4, $vr0, 1
-; LA64-NEXT:    vpickve2gr.h $t5, $vr1, 7
-; LA64-NEXT:    vpickve2gr.h $t6, $vr1, 5
-; LA64-NEXT:    vpickve2gr.h $t7, $vr1, 3
-; LA64-NEXT:    vpickve2gr.h $t8, $vr1, 1
-; LA64-NEXT:    bstrpick.d $t0, $t0, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA64-NEXT:    bstrpick.d $a7, $a7, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA64-NEXT:    bstrpick.d $a6, $a6, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA64-NEXT:    bstrpick.d $a5, $a5, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA64-NEXT:    bstrpick.d $a4, $a4, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA64-NEXT:    bstrpick.d $a3, $a3, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t8, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $t7, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $t6, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t5, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    bstrpick.d $a1, $t4, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $t3, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $t2, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t1, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA64-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_w_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.w.hu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <16 x i16>, ptr %a
   %vb = load <16 x i16>, ptr %b
@@ -1505,65 +607,13 @@ entry:
 }
 
 define void @vmulwod_d_wu(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_d_wu:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvrepli.b $xr2, 0
-; LA32-NEXT:    xvpickve.w $xr3, $xr0, 1
-; LA32-NEXT:    xvori.b $xr4, $xr2, 0
-; LA32-NEXT:    xvinsve0.w $xr4, $xr3, 0
-; LA32-NEXT:    xvpickve.w $xr3, $xr0, 3
-; LA32-NEXT:    xvinsve0.w $xr4, $xr3, 2
-; LA32-NEXT:    xvpickve.w $xr3, $xr0, 5
-; LA32-NEXT:    xvinsve0.w $xr4, $xr3, 4
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 7
-; LA32-NEXT:    xvinsve0.w $xr4, $xr0, 6
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 1
-; LA32-NEXT:    xvinsve0.w $xr2, $xr0, 0
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 3
-; LA32-NEXT:    xvinsve0.w $xr2, $xr0, 2
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 5
-; LA32-NEXT:    xvinsve0.w $xr2, $xr0, 4
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 7
-; LA32-NEXT:    xvinsve0.w $xr2, $xr0, 6
-; LA32-NEXT:    xvmul.d $xr0, $xr4, $xr2
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_d_wu:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; LA64-NEXT:    xvpickve2gr.w $a2, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA64-NEXT:    xvpickve2gr.w $a4, $xr0, 5
-; LA64-NEXT:    xvpickve2gr.w $a5, $xr1, 3
-; LA64-NEXT:    xvpickve2gr.w $a6, $xr1, 1
-; LA64-NEXT:    xvpickve2gr.w $a7, $xr1, 7
-; LA64-NEXT:    xvpickve2gr.w $t0, $xr1, 5
-; LA64-NEXT:    bstrpick.d $a4, $a4, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a4, 0
-; LA64-NEXT:    bstrpick.d $a3, $a3, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a3, 1
-; LA64-NEXT:    bstrpick.d $a2, $a2, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a2, 0
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA64-NEXT:    bstrpick.d $a1, $t0, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $a7, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $a6, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $a5, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA64-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_d_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.d.wu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i32>, ptr %a
   %vb = load <8 x i32>, ptr %b
@@ -1671,109 +721,9 @@ entry:
 define void @vmulwev_h_bu_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_h_bu_b:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvld $xr3, $a1, 0
-; CHECK-NEXT:    xvld $xr0, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr3, 14
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 0
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 2
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 4
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 6
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 8
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 10
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 12
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 0
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 2
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 4
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 6
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 8
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 10
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 12
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 14
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 0
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 2
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 4
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 6
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 8
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 10
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 12
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 0
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 2
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 4
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 6
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 8
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 10
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 12
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 7
-; CHECK-NEXT:    xvpermi.q $xr1, $xr3, 2
-; CHECK-NEXT:    xvpermi.q $xr4, $xr0, 2
-; CHECK-NEXT:    xvmul.h $xr0, $xr1, $xr4
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.h.bu.b $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -1789,125 +739,13 @@ entry:
 }
 
 define void @vmulwev_w_hu_h(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_w_hu_h:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA32-NEXT:    vpickve2gr.h $a1, $vr2, 6
-; LA32-NEXT:    vpickve2gr.h $a2, $vr2, 4
-; LA32-NEXT:    vpickve2gr.h $a3, $vr2, 2
-; LA32-NEXT:    vpickve2gr.h $a4, $vr2, 0
-; LA32-NEXT:    vpickve2gr.h $a5, $vr0, 6
-; LA32-NEXT:    vpickve2gr.h $a6, $vr0, 4
-; LA32-NEXT:    vpickve2gr.h $a7, $vr0, 2
-; LA32-NEXT:    vpickve2gr.h $t0, $vr0, 0
-; LA32-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA32-NEXT:    vpickve2gr.h $t1, $vr0, 6
-; LA32-NEXT:    vpickve2gr.h $t2, $vr0, 4
-; LA32-NEXT:    vpickve2gr.h $t3, $vr0, 2
-; LA32-NEXT:    vpickve2gr.h $t4, $vr0, 0
-; LA32-NEXT:    vpickve2gr.h $t5, $vr1, 6
-; LA32-NEXT:    vpickve2gr.h $t6, $vr1, 4
-; LA32-NEXT:    vpickve2gr.h $t7, $vr1, 2
-; LA32-NEXT:    vpickve2gr.h $t8, $vr1, 0
-; LA32-NEXT:    bstrpick.w $t0, $t0, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA32-NEXT:    bstrpick.w $a7, $a7, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA32-NEXT:    bstrpick.w $a6, $a6, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA32-NEXT:    bstrpick.w $a5, $a5, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA32-NEXT:    bstrpick.w $a4, $a4, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA32-NEXT:    bstrpick.w $a3, $a3, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA32-NEXT:    bstrpick.w $a2, $a2, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA32-NEXT:    bstrpick.w $a1, $a1, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA32-NEXT:    ext.w.h $a1, $t8
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA32-NEXT:    ext.w.h $a1, $t7
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA32-NEXT:    ext.w.h $a1, $t6
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA32-NEXT:    ext.w.h $a1, $t5
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    ext.w.h $a1, $t4
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA32-NEXT:    ext.w.h $a1, $t3
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA32-NEXT:    ext.w.h $a1, $t2
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    ext.w.h $a1, $t1
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA32-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_w_hu_h:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA64-NEXT:    vpickve2gr.h $a1, $vr2, 6
-; LA64-NEXT:    vpickve2gr.h $a2, $vr2, 4
-; LA64-NEXT:    vpickve2gr.h $a3, $vr2, 2
-; LA64-NEXT:    vpickve2gr.h $a4, $vr2, 0
-; LA64-NEXT:    vpickve2gr.h $a5, $vr0, 6
-; LA64-NEXT:    vpickve2gr.h $a6, $vr0, 4
-; LA64-NEXT:    vpickve2gr.h $a7, $vr0, 2
-; LA64-NEXT:    vpickve2gr.h $t0, $vr0, 0
-; LA64-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA64-NEXT:    vpickve2gr.h $t1, $vr0, 6
-; LA64-NEXT:    vpickve2gr.h $t2, $vr0, 4
-; LA64-NEXT:    vpickve2gr.h $t3, $vr0, 2
-; LA64-NEXT:    vpickve2gr.h $t4, $vr0, 0
-; LA64-NEXT:    vpickve2gr.h $t5, $vr1, 6
-; LA64-NEXT:    vpickve2gr.h $t6, $vr1, 4
-; LA64-NEXT:    vpickve2gr.h $t7, $vr1, 2
-; LA64-NEXT:    vpickve2gr.h $t8, $vr1, 0
-; LA64-NEXT:    bstrpick.d $t0, $t0, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA64-NEXT:    bstrpick.d $a7, $a7, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA64-NEXT:    bstrpick.d $a6, $a6, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA64-NEXT:    bstrpick.d $a5, $a5, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA64-NEXT:    bstrpick.d $a4, $a4, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA64-NEXT:    bstrpick.d $a3, $a3, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA64-NEXT:    ext.w.h $a1, $t8
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT:    ext.w.h $a1, $t7
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT:    ext.w.h $a1, $t6
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT:    ext.w.h $a1, $t5
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    ext.w.h $a1, $t4
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA64-NEXT:    ext.w.h $a1, $t3
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA64-NEXT:    ext.w.h $a1, $t2
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA64-NEXT:    ext.w.h $a1, $t1
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA64-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_w_hu_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.w.hu.h $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <16 x i16>, ptr %a
   %vb = load <16 x i16>, ptr %b
@@ -1921,68 +759,13 @@ entry:
 }
 
 define void @vmulwev_d_wu_w(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_d_wu_w:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a2, 0
-; LA32-NEXT:    xvld $xr1, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 6
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr0, 4
-; LA32-NEXT:    xvrepli.b $xr0, 0
-; LA32-NEXT:    xvinsve0.w $xr0, $xr1, 0
-; LA32-NEXT:    xvpickve.w $xr2, $xr1, 2
-; LA32-NEXT:    xvinsve0.w $xr0, $xr2, 2
-; LA32-NEXT:    xvpickve.w $xr2, $xr1, 4
-; LA32-NEXT:    xvinsve0.w $xr0, $xr2, 4
-; LA32-NEXT:    xvpickve.w $xr1, $xr1, 6
-; LA32-NEXT:    xvinsve0.w $xr0, $xr1, 6
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA32-NEXT:    srai.w $a4, $a4, 31
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 1
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 2
-; LA32-NEXT:    srai.w $a3, $a3, 31
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 3
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a2, 0
-; LA32-NEXT:    srai.w $a2, $a2, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a2, 1
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    srai.w $a1, $a1, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr2, $xr1, 2
-; LA32-NEXT:    xvmul.d $xr0, $xr0, $xr2
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_d_wu_w:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; LA64-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA64-NEXT:    xvpickve2gr.w $a3, $xr0, 6
-; LA64-NEXT:    xvpickve2gr.w $a4, $xr0, 4
-; LA64-NEXT:    xvpickve2gr.w $a5, $xr1, 2
-; LA64-NEXT:    xvpickve2gr.w $a6, $xr1, 0
-; LA64-NEXT:    xvpickve2gr.w $a7, $xr1, 6
-; LA64-NEXT:    xvpickve2gr.w $t0, $xr1, 4
-; LA64-NEXT:    bstrpick.d $a4, $a4, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a4, 0
-; LA64-NEXT:    bstrpick.d $a3, $a3, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a3, 1
-; LA64-NEXT:    bstrpick.d $a2, $a2, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a2, 0
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA64-NEXT:    vinsgr2vr.d $vr0, $t0, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a7, 1
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a6, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a5, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA64-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_d_wu_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.d.wu.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i32>, ptr %a
   %vb = load <8 x i32>, ptr %b
@@ -2122,109 +905,9 @@ entry:
 define void @vmulwod_h_bu_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwod_h_bu_b:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvld $xr3, $a1, 0
-; CHECK-NEXT:    xvld $xr0, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr3, 14
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 1
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 3
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 5
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 7
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 9
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 11
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 13
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 15
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 1
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 3
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 5
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 7
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 9
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 11
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 13
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 15
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 1
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 3
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 5
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 7
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 9
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 11
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 13
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 15
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 1
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 3
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 5
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 7
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 9
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 11
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 13
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 15
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 7
-; CHECK-NEXT:    xvpermi.q $xr1, $xr3, 2
-; CHECK-NEXT:    xvpermi.q $xr4, $xr0, 2
-; CHECK-NEXT:    xvmul.h $xr0, $xr1, $xr4
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.h.bu.b $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -2240,125 +923,13 @@ entry:
 }
 
 define void @vmulwod_w_hu_h(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_w_hu_h:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA32-NEXT:    vpickve2gr.h $a1, $vr2, 7
-; LA32-NEXT:    vpickve2gr.h $a2, $vr2, 5
-; LA32-NEXT:    vpickve2gr.h $a3, $vr2, 3
-; LA32-NEXT:    vpickve2gr.h $a4, $vr2, 1
-; LA32-NEXT:    vpickve2gr.h $a5, $vr0, 7
-; LA32-NEXT:    vpickve2gr.h $a6, $vr0, 5
-; LA32-NEXT:    vpickve2gr.h $a7, $vr0, 3
-; LA32-NEXT:    vpickve2gr.h $t0, $vr0, 1
-; LA32-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA32-NEXT:    vpickve2gr.h $t1, $vr0, 7
-; LA32-NEXT:    vpickve2gr.h $t2, $vr0, 5
-; LA32-NEXT:    vpickve2gr.h $t3, $vr0, 3
-; LA32-NEXT:    vpickve2gr.h $t4, $vr0, 1
-; LA32-NEXT:    vpickve2gr.h $t5, $vr1, 7
-; LA32-NEXT:    vpickve2gr.h $t6, $vr1, 5
-; LA32-NEXT:    vpickve2gr.h $t7, $vr1, 3
-; LA32-NEXT:    vpickve2gr.h $t8, $vr1, 1
-; LA32-NEXT:    bstrpick.w $t0, $t0, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA32-NEXT:    bstrpick.w $a7, $a7, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA32-NEXT:    bstrpick.w $a6, $a6, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA32-NEXT:    bstrpick.w $a5, $a5, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA32-NEXT:    bstrpick.w $a4, $a4, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA32-NEXT:    bstrpick.w $a3, $a3, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA32-NEXT:    bstrpick.w $a2, $a2, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA32-NEXT:    bstrpick.w $a1, $a1, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA32-NEXT:    ext.w.h $a1, $t8
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA32-NEXT:    ext.w.h $a1, $t7
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA32-NEXT:    ext.w.h $a1, $t6
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA32-NEXT:    ext.w.h $a1, $t5
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    ext.w.h $a1, $t4
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA32-NEXT:    ext.w.h $a1, $t3
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA32-NEXT:    ext.w.h $a1, $t2
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    ext.w.h $a1, $t1
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA32-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_w_hu_h:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA64-NEXT:    vpickve2gr.h $a1, $vr2, 7
-; LA64-NEXT:    vpickve2gr.h $a2, $vr2, 5
-; LA64-NEXT:    vpickve2gr.h $a3, $vr2, 3
-; LA64-NEXT:    vpickve2gr.h $a4, $vr2, 1
-; LA64-NEXT:    vpickve2gr.h $a5, $vr0, 7
-; LA64-NEXT:    vpickve2gr.h $a6, $vr0, 5
-; LA64-NEXT:    vpickve2gr.h $a7, $vr0, 3
-; LA64-NEXT:    vpickve2gr.h $t0, $vr0, 1
-; LA64-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA64-NEXT:    vpickve2gr.h $t1, $vr0, 7
-; LA64-NEXT:    vpickve2gr.h $t2, $vr0, 5
-; LA64-NEXT:    vpickve2gr.h $t3, $vr0, 3
-; LA64-NEXT:    vpickve2gr.h $t4, $vr0, 1
-; LA64-NEXT:    vpickve2gr.h $t5, $vr1, 7
-; LA64-NEXT:    vpickve2gr.h $t6, $vr1, 5
-; LA64-NEXT:    vpickve2gr.h $t7, $vr1, 3
-; LA64-NEXT:    vpickve2gr.h $t8, $vr1, 1
-; LA64-NEXT:    bstrpick.d $t0, $t0, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA64-NEXT:    bstrpick.d $a7, $a7, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA64-NEXT:    bstrpick.d $a6, $a6, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA64-NEXT:    bstrpick.d $a5, $a5, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA64-NEXT:    bstrpick.d $a4, $a4, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA64-NEXT:    bstrpick.d $a3, $a3, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA64-NEXT:    ext.w.h $a1, $t8
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT:    ext.w.h $a1, $t7
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT:    ext.w.h $a1, $t6
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT:    ext.w.h $a1, $t5
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    ext.w.h $a1, $t4
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA64-NEXT:    ext.w.h $a1, $t3
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA64-NEXT:    ext.w.h $a1, $t2
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA64-NEXT:    ext.w.h $a1, $t1
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA64-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_w_hu_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.w.hu.h $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <16 x i16>, ptr %a
   %vb = load <16 x i16>, ptr %b
@@ -2372,69 +943,13 @@ entry:
 }
 
 define void @vmulwod_d_wu_w(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_d_wu_w:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a2, 0
-; LA32-NEXT:    xvld $xr1, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 1
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr0, 5
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 1
-; LA32-NEXT:    xvrepli.b $xr2, 0
-; LA32-NEXT:    xvinsve0.w $xr2, $xr0, 0
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 3
-; LA32-NEXT:    xvinsve0.w $xr2, $xr0, 2
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 5
-; LA32-NEXT:    xvinsve0.w $xr2, $xr0, 4
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 7
-; LA32-NEXT:    xvinsve0.w $xr2, $xr0, 6
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 0
-; LA32-NEXT:    srai.w $a4, $a4, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 1
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 2
-; LA32-NEXT:    srai.w $a3, $a3, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 3
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 0
-; LA32-NEXT:    srai.w $a2, $a2, 31
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 1
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA32-NEXT:    srai.w $a1, $a1, 31
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA32-NEXT:    xvmul.d $xr0, $xr2, $xr1
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_d_wu_w:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; LA64-NEXT:    xvpickve2gr.w $a2, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA64-NEXT:    xvpickve2gr.w $a4, $xr0, 5
-; LA64-NEXT:    xvpickve2gr.w $a5, $xr1, 3
-; LA64-NEXT:    xvpickve2gr.w $a6, $xr1, 1
-; LA64-NEXT:    xvpickve2gr.w $a7, $xr1, 7
-; LA64-NEXT:    xvpickve2gr.w $t0, $xr1, 5
-; LA64-NEXT:    bstrpick.d $a4, $a4, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a4, 0
-; LA64-NEXT:    bstrpick.d $a3, $a3, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a3, 1
-; LA64-NEXT:    bstrpick.d $a2, $a2, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a2, 0
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA64-NEXT:    vinsgr2vr.d $vr0, $t0, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a7, 1
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a6, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a5, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA64-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_d_wu_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.d.wu.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i32>, ptr %a
   %vb = load <8 x i32>, ptr %b
@@ -2574,109 +1089,9 @@ entry:
 define void @vmulwev_h_bu_b_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_h_bu_b_1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvld $xr3, $a1, 0
-; CHECK-NEXT:    xvld $xr0, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr3, 14
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 0
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 2
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 4
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 6
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 8
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 10
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 12
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 0
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 2
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 4
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 6
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 8
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 10
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 12
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 14
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 0
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 2
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 4
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 6
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 8
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 10
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 12
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 0
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 2
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 4
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 6
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 8
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 10
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 12
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 14
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 7
-; CHECK-NEXT:    xvpermi.q $xr1, $xr3, 2
-; CHECK-NEXT:    xvpermi.q $xr4, $xr0, 2
-; CHECK-NEXT:    xvmul.h $xr0, $xr1, $xr4
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.h.bu.b $xr0, $xr1, $xr0
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -2692,125 +1107,13 @@ entry:
 }
 
 define void @vmulwev_w_hu_h_1(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_w_hu_h_1:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA32-NEXT:    vpickve2gr.h $a1, $vr2, 6
-; LA32-NEXT:    vpickve2gr.h $a2, $vr2, 4
-; LA32-NEXT:    vpickve2gr.h $a3, $vr2, 2
-; LA32-NEXT:    vpickve2gr.h $a4, $vr2, 0
-; LA32-NEXT:    vpickve2gr.h $a5, $vr0, 6
-; LA32-NEXT:    vpickve2gr.h $a6, $vr0, 4
-; LA32-NEXT:    vpickve2gr.h $a7, $vr0, 2
-; LA32-NEXT:    vpickve2gr.h $t0, $vr0, 0
-; LA32-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA32-NEXT:    vpickve2gr.h $t1, $vr0, 6
-; LA32-NEXT:    vpickve2gr.h $t2, $vr0, 4
-; LA32-NEXT:    vpickve2gr.h $t3, $vr0, 2
-; LA32-NEXT:    vpickve2gr.h $t4, $vr0, 0
-; LA32-NEXT:    vpickve2gr.h $t5, $vr1, 6
-; LA32-NEXT:    vpickve2gr.h $t6, $vr1, 4
-; LA32-NEXT:    vpickve2gr.h $t7, $vr1, 2
-; LA32-NEXT:    vpickve2gr.h $t8, $vr1, 0
-; LA32-NEXT:    ext.w.h $t0, $t0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA32-NEXT:    ext.w.h $a7, $a7
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA32-NEXT:    ext.w.h $a6, $a6
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA32-NEXT:    ext.w.h $a5, $a5
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA32-NEXT:    ext.w.h $a4, $a4
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA32-NEXT:    ext.w.h $a3, $a3
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA32-NEXT:    ext.w.h $a2, $a2
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA32-NEXT:    ext.w.h $a1, $a1
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t8, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA32-NEXT:    bstrpick.w $a1, $t7, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA32-NEXT:    bstrpick.w $a1, $t6, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t5, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    bstrpick.w $a1, $t4, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA32-NEXT:    bstrpick.w $a1, $t3, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA32-NEXT:    bstrpick.w $a1, $t2, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t1, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA32-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_w_hu_h_1:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA64-NEXT:    vpickve2gr.h $a1, $vr2, 6
-; LA64-NEXT:    vpickve2gr.h $a2, $vr2, 4
-; LA64-NEXT:    vpickve2gr.h $a3, $vr2, 2
-; LA64-NEXT:    vpickve2gr.h $a4, $vr2, 0
-; LA64-NEXT:    vpickve2gr.h $a5, $vr0, 6
-; LA64-NEXT:    vpickve2gr.h $a6, $vr0, 4
-; LA64-NEXT:    vpickve2gr.h $a7, $vr0, 2
-; LA64-NEXT:    vpickve2gr.h $t0, $vr0, 0
-; LA64-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA64-NEXT:    vpickve2gr.h $t1, $vr0, 6
-; LA64-NEXT:    vpickve2gr.h $t2, $vr0, 4
-; LA64-NEXT:    vpickve2gr.h $t3, $vr0, 2
-; LA64-NEXT:    vpickve2gr.h $t4, $vr0, 0
-; LA64-NEXT:    vpickve2gr.h $t5, $vr1, 6
-; LA64-NEXT:    vpickve2gr.h $t6, $vr1, 4
-; LA64-NEXT:    vpickve2gr.h $t7, $vr1, 2
-; LA64-NEXT:    vpickve2gr.h $t8, $vr1, 0
-; LA64-NEXT:    ext.w.h $t0, $t0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA64-NEXT:    ext.w.h $a7, $a7
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA64-NEXT:    ext.w.h $a6, $a6
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA64-NEXT:    ext.w.h $a5, $a5
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA64-NEXT:    ext.w.h $a4, $a4
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA64-NEXT:    ext.w.h $a3, $a3
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA64-NEXT:    ext.w.h $a2, $a2
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA64-NEXT:    ext.w.h $a1, $a1
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t8, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $t7, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $t6, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t5, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    bstrpick.d $a1, $t4, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $t3, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $t2, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t1, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA64-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_w_hu_h_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.w.hu.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <16 x i16>, ptr %a
   %vb = load <16 x i16>, ptr %b
@@ -2824,68 +1127,13 @@ entry:
 }
 
 define void @vmulwev_d_wu_w_1(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_d_wu_w_1:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 6
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr0, 4
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 0
-; LA32-NEXT:    srai.w $a4, $a4, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 1
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 2
-; LA32-NEXT:    srai.w $a3, $a3, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 3
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a2, 0
-; LA32-NEXT:    srai.w $a2, $a2, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a2, 1
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    srai.w $a1, $a1, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA32-NEXT:    xvrepli.b $xr0, 0
-; LA32-NEXT:    xvinsve0.w $xr0, $xr1, 0
-; LA32-NEXT:    xvpickve.w $xr3, $xr1, 2
-; LA32-NEXT:    xvinsve0.w $xr0, $xr3, 2
-; LA32-NEXT:    xvpickve.w $xr3, $xr1, 4
-; LA32-NEXT:    xvinsve0.w $xr0, $xr3, 4
-; LA32-NEXT:    xvpickve.w $xr1, $xr1, 6
-; LA32-NEXT:    xvinsve0.w $xr0, $xr1, 6
-; LA32-NEXT:    xvmul.d $xr0, $xr2, $xr0
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_d_wu_w_1:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; LA64-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA64-NEXT:    xvpickve2gr.w $a3, $xr0, 6
-; LA64-NEXT:    xvpickve2gr.w $a4, $xr0, 4
-; LA64-NEXT:    xvpickve2gr.w $a5, $xr1, 2
-; LA64-NEXT:    xvpickve2gr.w $a6, $xr1, 0
-; LA64-NEXT:    xvpickve2gr.w $a7, $xr1, 6
-; LA64-NEXT:    xvpickve2gr.w $t0, $xr1, 4
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a4, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a3, 1
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a2, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA64-NEXT:    bstrpick.d $a1, $t0, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $a7, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $a6, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $a5, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA64-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_d_wu_w_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.d.wu.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i32>, ptr %a
   %vb = load <8 x i32>, ptr %b
@@ -3025,109 +1273,9 @@ entry:
 define void @vmulwod_h_bu_b_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwod_h_bu_b_1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvld $xr3, $a1, 0
-; CHECK-NEXT:    xvld $xr0, $a2, 0
-; CHECK-NEXT:    xvpermi.d $xr2, $xr3, 14
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 1
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 3
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 5
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 7
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 9
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 11
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 13
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr3, 15
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 1
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 3
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 5
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 7
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 9
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 11
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 13
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 15
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    ext.w.b $a1, $a1
-; CHECK-NEXT:    vinsgr2vr.h $vr3, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 1
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 3
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 5
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 7
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 9
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 11
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 13
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 15
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr4, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 1
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 3
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 5
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 7
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 9
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 11
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 13
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr2, 15
-; CHECK-NEXT:    andi $a1, $a1, 255
-; CHECK-NEXT:    vinsgr2vr.h $vr0, $a1, 7
-; CHECK-NEXT:    xvpermi.q $xr1, $xr3, 2
-; CHECK-NEXT:    xvpermi.q $xr4, $xr0, 2
-; CHECK-NEXT:    xvmul.h $xr0, $xr1, $xr4
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.h.bu.b $xr0, $xr1, $xr0
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -3143,125 +1291,13 @@ entry:
 }
 
 define void @vmulwod_w_hu_h_1(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_w_hu_h_1:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA32-NEXT:    vpickve2gr.h $a1, $vr2, 7
-; LA32-NEXT:    vpickve2gr.h $a2, $vr2, 5
-; LA32-NEXT:    vpickve2gr.h $a3, $vr2, 3
-; LA32-NEXT:    vpickve2gr.h $a4, $vr2, 1
-; LA32-NEXT:    vpickve2gr.h $a5, $vr0, 7
-; LA32-NEXT:    vpickve2gr.h $a6, $vr0, 5
-; LA32-NEXT:    vpickve2gr.h $a7, $vr0, 3
-; LA32-NEXT:    vpickve2gr.h $t0, $vr0, 1
-; LA32-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA32-NEXT:    vpickve2gr.h $t1, $vr0, 7
-; LA32-NEXT:    vpickve2gr.h $t2, $vr0, 5
-; LA32-NEXT:    vpickve2gr.h $t3, $vr0, 3
-; LA32-NEXT:    vpickve2gr.h $t4, $vr0, 1
-; LA32-NEXT:    vpickve2gr.h $t5, $vr1, 7
-; LA32-NEXT:    vpickve2gr.h $t6, $vr1, 5
-; LA32-NEXT:    vpickve2gr.h $t7, $vr1, 3
-; LA32-NEXT:    vpickve2gr.h $t8, $vr1, 1
-; LA32-NEXT:    ext.w.h $t0, $t0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA32-NEXT:    ext.w.h $a7, $a7
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA32-NEXT:    ext.w.h $a6, $a6
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA32-NEXT:    ext.w.h $a5, $a5
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA32-NEXT:    ext.w.h $a4, $a4
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA32-NEXT:    ext.w.h $a3, $a3
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA32-NEXT:    ext.w.h $a2, $a2
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA32-NEXT:    ext.w.h $a1, $a1
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t8, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA32-NEXT:    bstrpick.w $a1, $t7, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA32-NEXT:    bstrpick.w $a1, $t6, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t5, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA32-NEXT:    bstrpick.w $a1, $t4, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA32-NEXT:    bstrpick.w $a1, $t3, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA32-NEXT:    bstrpick.w $a1, $t2, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    bstrpick.w $a1, $t1, 15, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA32-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_w_hu_h_1:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpermi.d $xr2, $xr0, 14
-; LA64-NEXT:    vpickve2gr.h $a1, $vr2, 7
-; LA64-NEXT:    vpickve2gr.h $a2, $vr2, 5
-; LA64-NEXT:    vpickve2gr.h $a3, $vr2, 3
-; LA64-NEXT:    vpickve2gr.h $a4, $vr2, 1
-; LA64-NEXT:    vpickve2gr.h $a5, $vr0, 7
-; LA64-NEXT:    vpickve2gr.h $a6, $vr0, 5
-; LA64-NEXT:    vpickve2gr.h $a7, $vr0, 3
-; LA64-NEXT:    vpickve2gr.h $t0, $vr0, 1
-; LA64-NEXT:    xvpermi.d $xr0, $xr1, 14
-; LA64-NEXT:    vpickve2gr.h $t1, $vr0, 7
-; LA64-NEXT:    vpickve2gr.h $t2, $vr0, 5
-; LA64-NEXT:    vpickve2gr.h $t3, $vr0, 3
-; LA64-NEXT:    vpickve2gr.h $t4, $vr0, 1
-; LA64-NEXT:    vpickve2gr.h $t5, $vr1, 7
-; LA64-NEXT:    vpickve2gr.h $t6, $vr1, 5
-; LA64-NEXT:    vpickve2gr.h $t7, $vr1, 3
-; LA64-NEXT:    vpickve2gr.h $t8, $vr1, 1
-; LA64-NEXT:    ext.w.h $t0, $t0
-; LA64-NEXT:    vinsgr2vr.w $vr0, $t0, 0
-; LA64-NEXT:    ext.w.h $a7, $a7
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a7, 1
-; LA64-NEXT:    ext.w.h $a6, $a6
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a6, 2
-; LA64-NEXT:    ext.w.h $a5, $a5
-; LA64-NEXT:    vinsgr2vr.w $vr0, $a5, 3
-; LA64-NEXT:    ext.w.h $a4, $a4
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a4, 0
-; LA64-NEXT:    ext.w.h $a3, $a3
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a3, 1
-; LA64-NEXT:    ext.w.h $a2, $a2
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a2, 2
-; LA64-NEXT:    ext.w.h $a1, $a1
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t8, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $t7, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $t6, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t5, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    bstrpick.d $a1, $t4, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $t3, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $t2, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA64-NEXT:    bstrpick.d $a1, $t1, 15, 0
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr1, $xr2, 2
-; LA64-NEXT:    xvmul.w $xr0, $xr0, $xr1
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_w_hu_h_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.w.hu.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <16 x i16>, ptr %a
   %vb = load <16 x i16>, ptr %b
@@ -3275,69 +1311,13 @@ entry:
 }
 
 define void @vmulwod_d_wu_w_1(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_d_wu_w_1:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 1
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr0, 5
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 0
-; LA32-NEXT:    srai.w $a4, $a4, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a4, 1
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 2
-; LA32-NEXT:    srai.w $a3, $a3, 31
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a3, 3
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a2, 0
-; LA32-NEXT:    srai.w $a2, $a2, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a2, 1
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    srai.w $a1, $a1, 31
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 1
-; LA32-NEXT:    xvrepli.b $xr3, 0
-; LA32-NEXT:    xvinsve0.w $xr3, $xr0, 0
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 3
-; LA32-NEXT:    xvinsve0.w $xr3, $xr0, 2
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 5
-; LA32-NEXT:    xvinsve0.w $xr3, $xr0, 4
-; LA32-NEXT:    xvpickve.w $xr0, $xr1, 7
-; LA32-NEXT:    xvinsve0.w $xr3, $xr0, 6
-; LA32-NEXT:    xvmul.d $xr0, $xr2, $xr3
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_d_wu_w_1:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; LA64-NEXT:    xvpickve2gr.w $a2, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA64-NEXT:    xvpickve2gr.w $a4, $xr0, 5
-; LA64-NEXT:    xvpickve2gr.w $a5, $xr1, 3
-; LA64-NEXT:    xvpickve2gr.w $a6, $xr1, 1
-; LA64-NEXT:    xvpickve2gr.w $a7, $xr1, 7
-; LA64-NEXT:    xvpickve2gr.w $t0, $xr1, 5
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a4, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a3, 1
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a2, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA64-NEXT:    bstrpick.d $a1, $t0, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $a7, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a1, 1
-; LA64-NEXT:    bstrpick.d $a1, $a6, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 0
-; LA64-NEXT:    bstrpick.d $a1, $a5, 31, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr0, 2
-; LA64-NEXT:    xvmul.d $xr0, $xr1, $xr2
-; LA64-NEXT:    xvst $xr0, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_d_wu_w_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.d.wu.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i32>, ptr %a
   %vb = load <8 x i32>, ptr %b
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll
index cd83c1dff652f..19b5ab50eef95 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll
@@ -7,11 +7,7 @@ define void @vmulwev_h_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 8
-; CHECK-NEXT:    vslli.h $vr1, $vr1, 8
-; CHECK-NEXT:    vsrai.h $vr1, $vr1, 8
-; CHECK-NEXT:    vmul.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwev.h.b $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -31,11 +27,7 @@ define void @vmulwev_w_h(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
-; CHECK-NEXT:    vslli.w $vr1, $vr1, 16
-; CHECK-NEXT:    vsrai.w $vr1, $vr1, 16
-; CHECK-NEXT:    vmul.w $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwev.w.h $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -55,11 +47,7 @@ define void @vmulwev_d_w(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
-; CHECK-NEXT:    vslli.d $vr1, $vr1, 32
-; CHECK-NEXT:    vsrai.d $vr1, $vr1, 32
-; CHECK-NEXT:    vmul.d $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwev.d.w $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -155,13 +143,7 @@ define void @vmulwod_h_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 49
-; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 8
-; CHECK-NEXT:    vshuf4i.b $vr1, $vr1, 49
-; CHECK-NEXT:    vslli.h $vr1, $vr1, 8
-; CHECK-NEXT:    vsrai.h $vr1, $vr1, 8
-; CHECK-NEXT:    vmul.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.h.b $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -181,13 +163,7 @@ define void @vmulwod_w_h(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 49
-; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
-; CHECK-NEXT:    vshuf4i.h $vr1, $vr1, 49
-; CHECK-NEXT:    vslli.w $vr1, $vr1, 16
-; CHECK-NEXT:    vsrai.w $vr1, $vr1, 16
-; CHECK-NEXT:    vmul.w $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.w.h $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -207,13 +183,7 @@ define void @vmulwod_d_w(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 49
-; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
-; CHECK-NEXT:    vshuf4i.w $vr1, $vr1, 49
-; CHECK-NEXT:    vslli.d $vr1, $vr1, 32
-; CHECK-NEXT:    vsrai.d $vr1, $vr1, 32
-; CHECK-NEXT:    vmul.d $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.d.w $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -308,13 +278,8 @@ define void @vmulwev_h_bu(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_h_bu:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    vld $vr2, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr3, $vr0, $vr1
-; CHECK-NEXT:    vshuf.b $vr1, $vr3, $vr2, $vr1
-; CHECK-NEXT:    vmul.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vmulwev.h.bu $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -332,15 +297,9 @@ entry:
 define void @vmulwev_w_hu(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_w_hu:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pcalau12i $a3, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr0, $a3, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    vld $vr1, $a1, 0
-; CHECK-NEXT:    vld $vr2, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vori.b $vr4, $vr0, 0
-; CHECK-NEXT:    vshuf.h $vr4, $vr3, $vr1
-; CHECK-NEXT:    vshuf.h $vr0, $vr3, $vr2
-; CHECK-NEXT:    vmul.w $vr0, $vr4, $vr0
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vmulwev.w.hu $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -358,15 +317,9 @@ entry:
 define void @vmulwev_d_wu(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_d_wu:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pcalau12i $a3, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT:    vld $vr0, $a3, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    vld $vr1, $a1, 0
-; CHECK-NEXT:    vld $vr2, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vori.b $vr4, $vr0, 0
-; CHECK-NEXT:    vshuf.w $vr4, $vr3, $vr1
-; CHECK-NEXT:    vshuf.w $vr0, $vr3, $vr2
-; CHECK-NEXT:    vmul.d $vr0, $vr4, $vr0
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vmulwev.d.wu $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -442,10 +395,7 @@ define void @vmulwod_h_bu(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vpackod.b $vr0, $vr2, $vr0
-; CHECK-NEXT:    vpackod.b $vr1, $vr2, $vr1
-; CHECK-NEXT:    vmul.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.h.bu $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -465,10 +415,7 @@ define void @vmulwod_w_hu(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vpackod.h $vr0, $vr2, $vr0
-; CHECK-NEXT:    vpackod.h $vr1, $vr2, $vr1
-; CHECK-NEXT:    vmul.w $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.w.hu $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -488,10 +435,7 @@ define void @vmulwod_d_wu(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vpackod.w $vr0, $vr2, $vr0
-; CHECK-NEXT:    vpackod.w $vr1, $vr2, $vr1
-; CHECK-NEXT:    vmul.d $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.d.wu $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -566,14 +510,8 @@ define void @vmulwev_h_bu_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_h_bu_b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI16_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI16_0)
-; CHECK-NEXT:    vld $vr2, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr3, $vr0, $vr1
-; CHECK-NEXT:    vslli.h $vr1, $vr2, 8
-; CHECK-NEXT:    vsrai.h $vr1, $vr1, 8
-; CHECK-NEXT:    vmul.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vmulwev.h.bu.b $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -592,14 +530,8 @@ define void @vmulwev_w_hu_h(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_w_hu_h:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI17_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI17_0)
-; CHECK-NEXT:    vld $vr2, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr3, $vr0
-; CHECK-NEXT:    vslli.w $vr0, $vr2, 16
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
-; CHECK-NEXT:    vmul.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vmulwev.w.hu.h $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -618,14 +550,8 @@ define void @vmulwev_d_wu_w(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_d_wu_w:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI18_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI18_0)
-; CHECK-NEXT:    vld $vr2, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr3, $vr0
-; CHECK-NEXT:    vslli.d $vr0, $vr2, 32
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
-; CHECK-NEXT:    vmul.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vmulwev.d.wu.w $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -714,12 +640,7 @@ define void @vmulwod_h_bu_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vpackod.b $vr0, $vr2, $vr0
-; CHECK-NEXT:    vshuf4i.b $vr1, $vr1, 49
-; CHECK-NEXT:    vslli.h $vr1, $vr1, 8
-; CHECK-NEXT:    vsrai.h $vr1, $vr1, 8
-; CHECK-NEXT:    vmul.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.h.bu.b $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -739,12 +660,7 @@ define void @vmulwod_w_hu_h(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vpackod.h $vr0, $vr2, $vr0
-; CHECK-NEXT:    vshuf4i.h $vr1, $vr1, 49
-; CHECK-NEXT:    vslli.w $vr1, $vr1, 16
-; CHECK-NEXT:    vsrai.w $vr1, $vr1, 16
-; CHECK-NEXT:    vmul.w $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.w.hu.h $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -764,12 +680,7 @@ define void @vmulwod_d_wu_w(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vpackod.w $vr0, $vr2, $vr0
-; CHECK-NEXT:    vshuf4i.w $vr1, $vr1, 49
-; CHECK-NEXT:    vslli.d $vr1, $vr1, 32
-; CHECK-NEXT:    vsrai.d $vr1, $vr1, 32
-; CHECK-NEXT:    vmul.d $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.d.wu.w $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -858,13 +769,7 @@ define void @vmulwev_h_bu_b_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI24_0)
-; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI24_0)
-; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 8
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.b $vr1, $vr3, $vr1, $vr2
-; CHECK-NEXT:    vmul.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwev.h.bu.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -884,13 +789,7 @@ define void @vmulwev_w_hu_h_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI25_0)
-; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI25_0)
-; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.h $vr2, $vr3, $vr1
-; CHECK-NEXT:    vmul.w $vr0, $vr0, $vr2
+; CHECK-NEXT:    vmulwev.w.hu.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -910,13 +809,7 @@ define void @vmulwev_d_wu_w_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI26_0)
-; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI26_0)
-; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.w $vr2, $vr3, $vr1
-; CHECK-NEXT:    vmul.d $vr0, $vr0, $vr2
+; CHECK-NEXT:    vmulwev.d.wu.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -1005,12 +898,7 @@ define void @vmulwod_h_bu_b_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 49
-; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 8
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vpackod.b $vr1, $vr2, $vr1
-; CHECK-NEXT:    vmul.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.h.bu.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -1030,12 +918,7 @@ define void @vmulwod_w_hu_h_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 49
-; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vpackod.h $vr1, $vr2, $vr1
-; CHECK-NEXT:    vmul.w $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.w.hu.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -1055,12 +938,7 @@ define void @vmulwod_d_wu_w_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 49
-; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vpackod.w $vr1, $vr2, $vr1
-; CHECK-NEXT:    vmul.d $vr0, $vr0, $vr1
+; CHECK-NEXT:    vmulwod.d.wu.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:

>From c0bb775e00274cbbd8ddbfd2d5e2b8677c9f2caa Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Thu, 9 Oct 2025 15:59:19 +0800
Subject: [PATCH 2/3] deal with lsx i128

---
 .../LoongArch/LoongArchISelLowering.cpp       |  56 ++-
 .../LoongArch/lsx/ir-instruction/mulwev_od.ll | 452 ++++--------------
 2 files changed, 130 insertions(+), 378 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 2763cef394620..032032874cd11 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -6691,15 +6691,15 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
   SDValue N1 = N->getOperand(1);
 
   if (ResTy != MVT::v8i16 && ResTy != MVT::v4i32 && ResTy != MVT::v2i64 &&
-      ResTy != MVT::v16i16 && ResTy != MVT::v8i32 &&
-      ResTy != MVT::v4i64) // && ResTy != MVT::v2i128)
+      ResTy != MVT::v16i16 && ResTy != MVT::v8i32 && ResTy != MVT::v4i64 &&
+      ResTy != MVT::i128)
     return SDValue();
 
   // Combine:
   //   ti,tii,...,tx = extract_vector_elt t0, {0,2,4,.../1,3,5,...}
   //   tj,tjj,...,ty = extract_vector_elt t1, {0,2,4,.../1,3,5,...}
-  //   tm = BUILD_VECTOR ti,tii,...,tx
-  //   tn = BUILD_VECTOR tj,tjj,...,ty
+  //   tm = BUILD_VECTOR ti,tii,...,tx (Only when ResTy != MVT::i128)
+  //   tn = BUILD_VECTOR tj,tjj,...,ty (Only when ResTy != MVT::i128)
   //   ta = {sign/zero}_extend tm
   //   tb = {sign/zero}_extend tn
   //   tr = mul ta, tb
@@ -6721,24 +6721,36 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
   if (ExtType < 0)
     return SDValue();
 
-  SDValue BV0 = N0.getOperand(0);
-  SDValue BV1 = N1.getOperand(0);
-  if (BV0.getOpcode() != ISD::BUILD_VECTOR ||
-      BV1.getOpcode() != ISD::BUILD_VECTOR)
+  SDValue Src0 = N0.getOperand(0);
+  SDValue Src1 = N1.getOperand(0);
+  bool IsScalar = (ResTy == MVT::i128);
+  if (IsScalar && (Src0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+                   Src1.getOpcode() != ISD::EXTRACT_VECTOR_ELT))
+    return SDValue();
+  if (!IsScalar && (Src0.getOpcode() != ISD::BUILD_VECTOR ||
+                    Src1.getOpcode() != ISD::BUILD_VECTOR))
     return SDValue();
 
-  unsigned ResBits = ResTy.getScalarType().getSizeInBits();
-  unsigned BV0Bits = BV0.getValueType().getScalarType().getSizeInBits();
-  unsigned BV1Bits = BV1.getValueType().getScalarType().getSizeInBits();
-  if (BV0Bits != BV1Bits || ResBits != BV0Bits * 2)
+  unsigned ResBits = ResTy.getScalarSizeInBits();
+  unsigned Src0Bits = Src0.getValueType().getScalarSizeInBits();
+  unsigned Src1Bits = Src1.getValueType().getScalarSizeInBits();
+  if (Src0Bits != Src1Bits || ResBits != Src0Bits * 2)
     return SDValue();
 
+  // Collect all EXTRACT_VECTOR_ELT.
+  SmallVector<std::pair<SDValue, SDValue>> Elems;
+  if (IsScalar) {
+    Elems.emplace_back(Src0, Src1);
+  } else {
+    for (unsigned i = 0; i < Src0.getNumOperands(); ++i)
+      Elems.emplace_back(Src0.getOperand(i), Src1.getOperand(i));
+  }
+
   unsigned Index;
   SDValue OrigN0, OrigN1;
-  for (unsigned i = 0; i < BV0.getNumOperands(); ++i) {
-    SDValue Op0 = BV0.getOperand(i);
-    SDValue Op1 = BV1.getOperand(i);
-    // Each element of BUILD_VECTOR must be EXTRACT_VECTOR_ELT.
+  bool First = true;
+  for (auto &[Op0, Op1] : Elems) {
+    // Each element must be EXTRACT_VECTOR_ELT.
     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       return SDValue();
@@ -6750,17 +6762,17 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
     auto *IdxC = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
     if (!IdxC)
       return SDValue();
-    unsigned CurIdx = IdxC->getZExtValue();
 
-    if (i == 0) {
+    unsigned CurIdx = IdxC->getZExtValue();
+    if (First) {
       if (CurIdx != 0 && CurIdx != 1)
         return SDValue();
       OrigN0 = Op0.getOperand(0);
       OrigN1 = Op1.getOperand(0);
+      First = false;
     } else {
-      if (CurIdx != Index + 2)
-        return SDValue();
-      if (Op0.getOperand(0) != OrigN0 || Op1.getOperand(0) != OrigN1)
+      if (CurIdx != Index + 2 || Op0.getOperand(0) != OrigN0 ||
+          Op1.getOperand(0) != OrigN1)
         return SDValue();
     }
     Index = CurIdx;
@@ -6769,7 +6781,7 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
   if (OrigN0.getValueType() != OrigN1.getValueType())
     return SDValue();
   if (OrigN0.getValueType().getVectorNumElements() !=
-      ResTy.getVectorNumElements() * 2)
+      (IsScalar ? 1 : ResTy.getVectorNumElements()) * 2)
     return SDValue();
 
   SDValue Result;
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll
index 19b5ab50eef95..3036fcfa49fec 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mulwev_od.ll
@@ -67,64 +67,23 @@ define void @vmulwev_q_d(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    vld $vr0, $a1, 0
 ; LA32-NEXT:    vld $vr1, $a2, 0
+; LA32-NEXT:    vmulwev.q.d $vr0, $vr0, $vr1
 ; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 0
 ; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 1
-; LA32-NEXT:    vpickve2gr.w $a3, $vr1, 0
-; LA32-NEXT:    vpickve2gr.w $a4, $vr1, 1
-; LA32-NEXT:    srai.w $a5, $a2, 31
-; LA32-NEXT:    srai.w $a6, $a4, 31
-; LA32-NEXT:    mulh.wu $a7, $a1, $a3
-; LA32-NEXT:    mul.w $t0, $a2, $a3
-; LA32-NEXT:    add.w $a7, $t0, $a7
-; LA32-NEXT:    sltu $t0, $a7, $t0
-; LA32-NEXT:    mulh.wu $t1, $a2, $a3
-; LA32-NEXT:    add.w $t0, $t1, $t0
-; LA32-NEXT:    mul.w $t1, $a1, $a4
-; LA32-NEXT:    add.w $a7, $t1, $a7
-; LA32-NEXT:    sltu $t1, $a7, $t1
-; LA32-NEXT:    mulh.wu $t2, $a1, $a4
-; LA32-NEXT:    add.w $t1, $t2, $t1
-; LA32-NEXT:    add.w $t1, $t0, $t1
-; LA32-NEXT:    mul.w $t2, $a2, $a4
-; LA32-NEXT:    add.w $t3, $t2, $t1
-; LA32-NEXT:    mul.w $t4, $a3, $a5
-; LA32-NEXT:    mul.w $t5, $a6, $a1
-; LA32-NEXT:    add.w $t6, $t5, $t4
-; LA32-NEXT:    add.w $t7, $t3, $t6
-; LA32-NEXT:    sltu $t8, $t7, $t3
-; LA32-NEXT:    sltu $t2, $t3, $t2
-; LA32-NEXT:    sltu $t0, $t1, $t0
-; LA32-NEXT:    mulh.wu $t1, $a2, $a4
-; LA32-NEXT:    add.w $t0, $t1, $t0
-; LA32-NEXT:    add.w $t0, $t0, $t2
-; LA32-NEXT:    mulh.wu $t1, $a3, $a5
-; LA32-NEXT:    add.w $t1, $t1, $t4
-; LA32-NEXT:    mul.w $a4, $a4, $a5
-; LA32-NEXT:    add.w $a4, $t1, $a4
-; LA32-NEXT:    mul.w $a2, $a6, $a2
-; LA32-NEXT:    mulh.wu $a5, $a6, $a1
-; LA32-NEXT:    add.w $a2, $a5, $a2
-; LA32-NEXT:    add.w $a2, $a2, $t5
-; LA32-NEXT:    add.w $a2, $a2, $a4
-; LA32-NEXT:    sltu $a4, $t6, $t5
-; LA32-NEXT:    add.w $a2, $a2, $a4
-; LA32-NEXT:    add.w $a2, $t0, $a2
-; LA32-NEXT:    add.w $a2, $a2, $t8
-; LA32-NEXT:    mul.w $a1, $a1, $a3
+; LA32-NEXT:    vpickve2gr.w $a3, $vr0, 2
+; LA32-NEXT:    vpickve2gr.w $a4, $vr0, 3
+; LA32-NEXT:    st.w $a4, $a0, 12
+; LA32-NEXT:    st.w $a3, $a0, 8
+; LA32-NEXT:    st.w $a2, $a0, 4
 ; LA32-NEXT:    st.w $a1, $a0, 0
-; LA32-NEXT:    st.w $a7, $a0, 4
-; LA32-NEXT:    st.w $t7, $a0, 8
-; LA32-NEXT:    st.w $a2, $a0, 12
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: vmulwev_q_d:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    ld.d $a1, $a1, 0
-; LA64-NEXT:    ld.d $a2, $a2, 0
-; LA64-NEXT:    mul.d $a3, $a1, $a2
-; LA64-NEXT:    mulh.d $a1, $a1, $a2
-; LA64-NEXT:    st.d $a1, $a0, 8
-; LA64-NEXT:    st.d $a3, $a0, 0
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vmulwev.q.d $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
 entry:
   %va = load <2 x i64>, ptr %a
@@ -203,64 +162,23 @@ define void @vmulwod_q_d(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    vld $vr0, $a1, 0
 ; LA32-NEXT:    vld $vr1, $a2, 0
-; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 2
-; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 3
-; LA32-NEXT:    vpickve2gr.w $a3, $vr1, 2
-; LA32-NEXT:    vpickve2gr.w $a4, $vr1, 3
-; LA32-NEXT:    srai.w $a5, $a2, 31
-; LA32-NEXT:    srai.w $a6, $a4, 31
-; LA32-NEXT:    mulh.wu $a7, $a1, $a3
-; LA32-NEXT:    mul.w $t0, $a2, $a3
-; LA32-NEXT:    add.w $a7, $t0, $a7
-; LA32-NEXT:    sltu $t0, $a7, $t0
-; LA32-NEXT:    mulh.wu $t1, $a2, $a3
-; LA32-NEXT:    add.w $t0, $t1, $t0
-; LA32-NEXT:    mul.w $t1, $a1, $a4
-; LA32-NEXT:    add.w $a7, $t1, $a7
-; LA32-NEXT:    sltu $t1, $a7, $t1
-; LA32-NEXT:    mulh.wu $t2, $a1, $a4
-; LA32-NEXT:    add.w $t1, $t2, $t1
-; LA32-NEXT:    add.w $t1, $t0, $t1
-; LA32-NEXT:    mul.w $t2, $a2, $a4
-; LA32-NEXT:    add.w $t3, $t2, $t1
-; LA32-NEXT:    mul.w $t4, $a3, $a5
-; LA32-NEXT:    mul.w $t5, $a6, $a1
-; LA32-NEXT:    add.w $t6, $t5, $t4
-; LA32-NEXT:    add.w $t7, $t3, $t6
-; LA32-NEXT:    sltu $t8, $t7, $t3
-; LA32-NEXT:    sltu $t2, $t3, $t2
-; LA32-NEXT:    sltu $t0, $t1, $t0
-; LA32-NEXT:    mulh.wu $t1, $a2, $a4
-; LA32-NEXT:    add.w $t0, $t1, $t0
-; LA32-NEXT:    add.w $t0, $t0, $t2
-; LA32-NEXT:    mulh.wu $t1, $a3, $a5
-; LA32-NEXT:    add.w $t1, $t1, $t4
-; LA32-NEXT:    mul.w $a4, $a4, $a5
-; LA32-NEXT:    add.w $a4, $t1, $a4
-; LA32-NEXT:    mul.w $a2, $a6, $a2
-; LA32-NEXT:    mulh.wu $a5, $a6, $a1
-; LA32-NEXT:    add.w $a2, $a5, $a2
-; LA32-NEXT:    add.w $a2, $a2, $t5
-; LA32-NEXT:    add.w $a2, $a2, $a4
-; LA32-NEXT:    sltu $a4, $t6, $t5
-; LA32-NEXT:    add.w $a2, $a2, $a4
-; LA32-NEXT:    add.w $a2, $t0, $a2
-; LA32-NEXT:    add.w $a2, $a2, $t8
-; LA32-NEXT:    mul.w $a1, $a1, $a3
+; LA32-NEXT:    vmulwod.q.d $vr0, $vr0, $vr1
+; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 0
+; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 1
+; LA32-NEXT:    vpickve2gr.w $a3, $vr0, 2
+; LA32-NEXT:    vpickve2gr.w $a4, $vr0, 3
+; LA32-NEXT:    st.w $a4, $a0, 12
+; LA32-NEXT:    st.w $a3, $a0, 8
+; LA32-NEXT:    st.w $a2, $a0, 4
 ; LA32-NEXT:    st.w $a1, $a0, 0
-; LA32-NEXT:    st.w $a7, $a0, 4
-; LA32-NEXT:    st.w $t7, $a0, 8
-; LA32-NEXT:    st.w $a2, $a0, 12
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: vmulwod_q_d:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    ld.d $a1, $a1, 8
-; LA64-NEXT:    ld.d $a2, $a2, 8
-; LA64-NEXT:    mul.d $a3, $a1, $a2
-; LA64-NEXT:    mulh.d $a1, $a1, $a2
-; LA64-NEXT:    st.d $a1, $a0, 8
-; LA64-NEXT:    st.d $a3, $a0, 0
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vmulwod.q.d $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
 entry:
   %va = load <2 x i64>, ptr %a
@@ -339,44 +257,23 @@ define void @vmulwev_q_du(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    vld $vr0, $a1, 0
 ; LA32-NEXT:    vld $vr1, $a2, 0
-; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 1
-; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 0
-; LA32-NEXT:    vpickve2gr.w $a3, $vr1, 1
-; LA32-NEXT:    vpickve2gr.w $a4, $vr1, 0
-; LA32-NEXT:    mulh.wu $a5, $a2, $a4
-; LA32-NEXT:    mul.w $a6, $a1, $a4
-; LA32-NEXT:    add.w $a5, $a6, $a5
-; LA32-NEXT:    sltu $a6, $a5, $a6
-; LA32-NEXT:    mulh.wu $a7, $a1, $a4
-; LA32-NEXT:    add.w $a6, $a7, $a6
-; LA32-NEXT:    mul.w $a7, $a2, $a3
-; LA32-NEXT:    add.w $a5, $a7, $a5
-; LA32-NEXT:    sltu $a7, $a5, $a7
-; LA32-NEXT:    mulh.wu $t0, $a2, $a3
-; LA32-NEXT:    add.w $a7, $t0, $a7
-; LA32-NEXT:    add.w $a7, $a6, $a7
-; LA32-NEXT:    mul.w $t0, $a1, $a3
-; LA32-NEXT:    add.w $t1, $t0, $a7
-; LA32-NEXT:    sltu $t0, $t1, $t0
-; LA32-NEXT:    sltu $a6, $a7, $a6
-; LA32-NEXT:    mulh.wu $a1, $a1, $a3
-; LA32-NEXT:    add.w $a1, $a1, $a6
-; LA32-NEXT:    add.w $a1, $a1, $t0
-; LA32-NEXT:    mul.w $a2, $a2, $a4
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $a5, $a0, 4
-; LA32-NEXT:    st.w $t1, $a0, 8
-; LA32-NEXT:    st.w $a1, $a0, 12
+; LA32-NEXT:    vmulwev.q.du $vr0, $vr0, $vr1
+; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 0
+; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 1
+; LA32-NEXT:    vpickve2gr.w $a3, $vr0, 2
+; LA32-NEXT:    vpickve2gr.w $a4, $vr0, 3
+; LA32-NEXT:    st.w $a4, $a0, 12
+; LA32-NEXT:    st.w $a3, $a0, 8
+; LA32-NEXT:    st.w $a2, $a0, 4
+; LA32-NEXT:    st.w $a1, $a0, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: vmulwev_q_du:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    ld.d $a1, $a1, 0
-; LA64-NEXT:    ld.d $a2, $a2, 0
-; LA64-NEXT:    mul.d $a3, $a1, $a2
-; LA64-NEXT:    mulh.du $a1, $a1, $a2
-; LA64-NEXT:    st.d $a1, $a0, 8
-; LA64-NEXT:    st.d $a3, $a0, 0
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vmulwev.q.du $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
 entry:
   %va = load <2 x i64>, ptr %a
@@ -455,44 +352,23 @@ define void @vmulwod_q_du(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    vld $vr0, $a1, 0
 ; LA32-NEXT:    vld $vr1, $a2, 0
-; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 3
-; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 2
-; LA32-NEXT:    vpickve2gr.w $a3, $vr1, 3
-; LA32-NEXT:    vpickve2gr.w $a4, $vr1, 2
-; LA32-NEXT:    mulh.wu $a5, $a2, $a4
-; LA32-NEXT:    mul.w $a6, $a1, $a4
-; LA32-NEXT:    add.w $a5, $a6, $a5
-; LA32-NEXT:    sltu $a6, $a5, $a6
-; LA32-NEXT:    mulh.wu $a7, $a1, $a4
-; LA32-NEXT:    add.w $a6, $a7, $a6
-; LA32-NEXT:    mul.w $a7, $a2, $a3
-; LA32-NEXT:    add.w $a5, $a7, $a5
-; LA32-NEXT:    sltu $a7, $a5, $a7
-; LA32-NEXT:    mulh.wu $t0, $a2, $a3
-; LA32-NEXT:    add.w $a7, $t0, $a7
-; LA32-NEXT:    add.w $a7, $a6, $a7
-; LA32-NEXT:    mul.w $t0, $a1, $a3
-; LA32-NEXT:    add.w $t1, $t0, $a7
-; LA32-NEXT:    sltu $t0, $t1, $t0
-; LA32-NEXT:    sltu $a6, $a7, $a6
-; LA32-NEXT:    mulh.wu $a1, $a1, $a3
-; LA32-NEXT:    add.w $a1, $a1, $a6
-; LA32-NEXT:    add.w $a1, $a1, $t0
-; LA32-NEXT:    mul.w $a2, $a2, $a4
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $a5, $a0, 4
-; LA32-NEXT:    st.w $t1, $a0, 8
-; LA32-NEXT:    st.w $a1, $a0, 12
+; LA32-NEXT:    vmulwod.q.du $vr0, $vr0, $vr1
+; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 0
+; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 1
+; LA32-NEXT:    vpickve2gr.w $a3, $vr0, 2
+; LA32-NEXT:    vpickve2gr.w $a4, $vr0, 3
+; LA32-NEXT:    st.w $a4, $a0, 12
+; LA32-NEXT:    st.w $a3, $a0, 8
+; LA32-NEXT:    st.w $a2, $a0, 4
+; LA32-NEXT:    st.w $a1, $a0, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: vmulwod_q_du:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    ld.d $a1, $a1, 8
-; LA64-NEXT:    ld.d $a2, $a2, 8
-; LA64-NEXT:    mul.d $a3, $a1, $a2
-; LA64-NEXT:    mulh.du $a1, $a1, $a2
-; LA64-NEXT:    st.d $a1, $a0, 8
-; LA64-NEXT:    st.d $a3, $a0, 0
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vmulwod.q.du $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
 entry:
   %va = load <2 x i64>, ptr %a
@@ -571,57 +447,23 @@ define void @vmulwev_q_du_d(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    vld $vr0, $a1, 0
 ; LA32-NEXT:    vld $vr1, $a2, 0
-; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 1
-; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 0
-; LA32-NEXT:    vpickve2gr.w $a3, $vr1, 0
-; LA32-NEXT:    vpickve2gr.w $a4, $vr1, 1
-; LA32-NEXT:    srai.w $a5, $a4, 31
-; LA32-NEXT:    mulh.wu $a6, $a2, $a3
-; LA32-NEXT:    mul.w $a7, $a1, $a3
-; LA32-NEXT:    add.w $a6, $a7, $a6
-; LA32-NEXT:    sltu $a7, $a6, $a7
-; LA32-NEXT:    mulh.wu $t0, $a1, $a3
-; LA32-NEXT:    add.w $a7, $t0, $a7
-; LA32-NEXT:    mul.w $t0, $a2, $a4
-; LA32-NEXT:    add.w $a6, $t0, $a6
-; LA32-NEXT:    sltu $t0, $a6, $t0
-; LA32-NEXT:    mulh.wu $t1, $a2, $a4
-; LA32-NEXT:    add.w $t0, $t1, $t0
-; LA32-NEXT:    add.w $t0, $a7, $t0
-; LA32-NEXT:    mul.w $t1, $a1, $a4
-; LA32-NEXT:    add.w $t2, $t1, $t0
-; LA32-NEXT:    mul.w $t3, $a5, $a2
-; LA32-NEXT:    add.w $t4, $t2, $t3
-; LA32-NEXT:    sltu $t5, $t4, $t2
-; LA32-NEXT:    sltu $t1, $t2, $t1
-; LA32-NEXT:    sltu $a7, $t0, $a7
-; LA32-NEXT:    mulh.wu $a4, $a1, $a4
-; LA32-NEXT:    add.w $a4, $a4, $a7
-; LA32-NEXT:    add.w $a4, $a4, $t1
-; LA32-NEXT:    mul.w $a1, $a5, $a1
-; LA32-NEXT:    mulh.wu $a5, $a5, $a2
-; LA32-NEXT:    add.w $a1, $a5, $a1
-; LA32-NEXT:    add.w $a1, $a1, $t3
-; LA32-NEXT:    add.w $a1, $a4, $a1
-; LA32-NEXT:    add.w $a1, $a1, $t5
-; LA32-NEXT:    mul.w $a2, $a2, $a3
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $a6, $a0, 4
-; LA32-NEXT:    st.w $t4, $a0, 8
-; LA32-NEXT:    st.w $a1, $a0, 12
+; LA32-NEXT:    vmulwev.q.du.d $vr0, $vr0, $vr1
+; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 0
+; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 1
+; LA32-NEXT:    vpickve2gr.w $a3, $vr0, 2
+; LA32-NEXT:    vpickve2gr.w $a4, $vr0, 3
+; LA32-NEXT:    st.w $a4, $a0, 12
+; LA32-NEXT:    st.w $a3, $a0, 8
+; LA32-NEXT:    st.w $a2, $a0, 4
+; LA32-NEXT:    st.w $a1, $a0, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: vmulwev_q_du_d:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    ld.d $a2, $a2, 0
-; LA64-NEXT:    ld.d $a1, $a1, 0
-; LA64-NEXT:    srai.d $a3, $a2, 63
-; LA64-NEXT:    mulh.du $a4, $a1, $a2
-; LA64-NEXT:    mul.d $a3, $a1, $a3
-; LA64-NEXT:    add.d $a3, $a4, $a3
-; LA64-NEXT:    mul.d $a1, $a1, $a2
-; LA64-NEXT:    st.d $a1, $a0, 0
-; LA64-NEXT:    st.d $a3, $a0, 8
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vmulwev.q.du.d $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
 entry:
   %va = load <2 x i64>, ptr %a
@@ -700,57 +542,23 @@ define void @vmulwod_q_du_d(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    vld $vr0, $a1, 0
 ; LA32-NEXT:    vld $vr1, $a2, 0
-; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 3
-; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 2
-; LA32-NEXT:    vpickve2gr.w $a3, $vr1, 2
-; LA32-NEXT:    vpickve2gr.w $a4, $vr1, 3
-; LA32-NEXT:    srai.w $a5, $a4, 31
-; LA32-NEXT:    mulh.wu $a6, $a2, $a3
-; LA32-NEXT:    mul.w $a7, $a1, $a3
-; LA32-NEXT:    add.w $a6, $a7, $a6
-; LA32-NEXT:    sltu $a7, $a6, $a7
-; LA32-NEXT:    mulh.wu $t0, $a1, $a3
-; LA32-NEXT:    add.w $a7, $t0, $a7
-; LA32-NEXT:    mul.w $t0, $a2, $a4
-; LA32-NEXT:    add.w $a6, $t0, $a6
-; LA32-NEXT:    sltu $t0, $a6, $t0
-; LA32-NEXT:    mulh.wu $t1, $a2, $a4
-; LA32-NEXT:    add.w $t0, $t1, $t0
-; LA32-NEXT:    add.w $t0, $a7, $t0
-; LA32-NEXT:    mul.w $t1, $a1, $a4
-; LA32-NEXT:    add.w $t2, $t1, $t0
-; LA32-NEXT:    mul.w $t3, $a5, $a2
-; LA32-NEXT:    add.w $t4, $t2, $t3
-; LA32-NEXT:    sltu $t5, $t4, $t2
-; LA32-NEXT:    sltu $t1, $t2, $t1
-; LA32-NEXT:    sltu $a7, $t0, $a7
-; LA32-NEXT:    mulh.wu $a4, $a1, $a4
-; LA32-NEXT:    add.w $a4, $a4, $a7
-; LA32-NEXT:    add.w $a4, $a4, $t1
-; LA32-NEXT:    mul.w $a1, $a5, $a1
-; LA32-NEXT:    mulh.wu $a5, $a5, $a2
-; LA32-NEXT:    add.w $a1, $a5, $a1
-; LA32-NEXT:    add.w $a1, $a1, $t3
-; LA32-NEXT:    add.w $a1, $a4, $a1
-; LA32-NEXT:    add.w $a1, $a1, $t5
-; LA32-NEXT:    mul.w $a2, $a2, $a3
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $a6, $a0, 4
-; LA32-NEXT:    st.w $t4, $a0, 8
-; LA32-NEXT:    st.w $a1, $a0, 12
+; LA32-NEXT:    vmulwod.q.du.d $vr0, $vr0, $vr1
+; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 0
+; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 1
+; LA32-NEXT:    vpickve2gr.w $a3, $vr0, 2
+; LA32-NEXT:    vpickve2gr.w $a4, $vr0, 3
+; LA32-NEXT:    st.w $a4, $a0, 12
+; LA32-NEXT:    st.w $a3, $a0, 8
+; LA32-NEXT:    st.w $a2, $a0, 4
+; LA32-NEXT:    st.w $a1, $a0, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: vmulwod_q_du_d:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    ld.d $a2, $a2, 8
-; LA64-NEXT:    ld.d $a1, $a1, 8
-; LA64-NEXT:    srai.d $a3, $a2, 63
-; LA64-NEXT:    mulh.du $a4, $a1, $a2
-; LA64-NEXT:    mul.d $a3, $a1, $a3
-; LA64-NEXT:    add.d $a3, $a4, $a3
-; LA64-NEXT:    mul.d $a1, $a1, $a2
-; LA64-NEXT:    st.d $a1, $a0, 0
-; LA64-NEXT:    st.d $a3, $a0, 8
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vmulwod.q.du.d $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
 entry:
   %va = load <2 x i64>, ptr %a
@@ -829,57 +637,23 @@ define void @vmulwev_q_du_d_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    vld $vr0, $a1, 0
 ; LA32-NEXT:    vld $vr1, $a2, 0
+; LA32-NEXT:    vmulwev.q.du.d $vr0, $vr1, $vr0
 ; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 0
 ; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 1
-; LA32-NEXT:    vpickve2gr.w $a3, $vr1, 1
-; LA32-NEXT:    vpickve2gr.w $a4, $vr1, 0
-; LA32-NEXT:    srai.w $a5, $a2, 31
-; LA32-NEXT:    mulh.wu $a6, $a1, $a4
-; LA32-NEXT:    mul.w $a7, $a2, $a4
-; LA32-NEXT:    add.w $a6, $a7, $a6
-; LA32-NEXT:    sltu $a7, $a6, $a7
-; LA32-NEXT:    mulh.wu $t0, $a2, $a4
-; LA32-NEXT:    add.w $a7, $t0, $a7
-; LA32-NEXT:    mul.w $t0, $a1, $a3
-; LA32-NEXT:    add.w $a6, $t0, $a6
-; LA32-NEXT:    sltu $t0, $a6, $t0
-; LA32-NEXT:    mulh.wu $t1, $a1, $a3
-; LA32-NEXT:    add.w $t0, $t1, $t0
-; LA32-NEXT:    add.w $t0, $a7, $t0
-; LA32-NEXT:    mul.w $t1, $a2, $a3
-; LA32-NEXT:    add.w $t2, $t1, $t0
-; LA32-NEXT:    mul.w $t3, $a4, $a5
-; LA32-NEXT:    add.w $t4, $t2, $t3
-; LA32-NEXT:    sltu $t5, $t4, $t2
-; LA32-NEXT:    sltu $t1, $t2, $t1
-; LA32-NEXT:    sltu $a7, $t0, $a7
-; LA32-NEXT:    mulh.wu $a2, $a2, $a3
-; LA32-NEXT:    add.w $a2, $a2, $a7
-; LA32-NEXT:    add.w $a2, $a2, $t1
-; LA32-NEXT:    mulh.wu $a7, $a4, $a5
-; LA32-NEXT:    add.w $a7, $a7, $t3
-; LA32-NEXT:    mul.w $a3, $a3, $a5
-; LA32-NEXT:    add.w $a3, $a7, $a3
-; LA32-NEXT:    add.w $a2, $a2, $a3
-; LA32-NEXT:    add.w $a2, $a2, $t5
-; LA32-NEXT:    mul.w $a1, $a1, $a4
+; LA32-NEXT:    vpickve2gr.w $a3, $vr0, 2
+; LA32-NEXT:    vpickve2gr.w $a4, $vr0, 3
+; LA32-NEXT:    st.w $a4, $a0, 12
+; LA32-NEXT:    st.w $a3, $a0, 8
+; LA32-NEXT:    st.w $a2, $a0, 4
 ; LA32-NEXT:    st.w $a1, $a0, 0
-; LA32-NEXT:    st.w $a6, $a0, 4
-; LA32-NEXT:    st.w $t4, $a0, 8
-; LA32-NEXT:    st.w $a2, $a0, 12
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: vmulwev_q_du_d_1:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    ld.d $a1, $a1, 0
-; LA64-NEXT:    ld.d $a2, $a2, 0
-; LA64-NEXT:    srai.d $a3, $a1, 63
-; LA64-NEXT:    mulh.du $a4, $a1, $a2
-; LA64-NEXT:    mul.d $a3, $a3, $a2
-; LA64-NEXT:    add.d $a3, $a4, $a3
-; LA64-NEXT:    mul.d $a1, $a1, $a2
-; LA64-NEXT:    st.d $a1, $a0, 0
-; LA64-NEXT:    st.d $a3, $a0, 8
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vmulwev.q.du.d $vr0, $vr1, $vr0
+; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
 entry:
   %va = load <2 x i64>, ptr %a
@@ -958,57 +732,23 @@ define void @vmulwod_q_du_d_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    vld $vr0, $a1, 0
 ; LA32-NEXT:    vld $vr1, $a2, 0
-; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 2
-; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 3
-; LA32-NEXT:    vpickve2gr.w $a3, $vr1, 3
-; LA32-NEXT:    vpickve2gr.w $a4, $vr1, 2
-; LA32-NEXT:    srai.w $a5, $a2, 31
-; LA32-NEXT:    mulh.wu $a6, $a1, $a4
-; LA32-NEXT:    mul.w $a7, $a2, $a4
-; LA32-NEXT:    add.w $a6, $a7, $a6
-; LA32-NEXT:    sltu $a7, $a6, $a7
-; LA32-NEXT:    mulh.wu $t0, $a2, $a4
-; LA32-NEXT:    add.w $a7, $t0, $a7
-; LA32-NEXT:    mul.w $t0, $a1, $a3
-; LA32-NEXT:    add.w $a6, $t0, $a6
-; LA32-NEXT:    sltu $t0, $a6, $t0
-; LA32-NEXT:    mulh.wu $t1, $a1, $a3
-; LA32-NEXT:    add.w $t0, $t1, $t0
-; LA32-NEXT:    add.w $t0, $a7, $t0
-; LA32-NEXT:    mul.w $t1, $a2, $a3
-; LA32-NEXT:    add.w $t2, $t1, $t0
-; LA32-NEXT:    mul.w $t3, $a4, $a5
-; LA32-NEXT:    add.w $t4, $t2, $t3
-; LA32-NEXT:    sltu $t5, $t4, $t2
-; LA32-NEXT:    sltu $t1, $t2, $t1
-; LA32-NEXT:    sltu $a7, $t0, $a7
-; LA32-NEXT:    mulh.wu $a2, $a2, $a3
-; LA32-NEXT:    add.w $a2, $a2, $a7
-; LA32-NEXT:    add.w $a2, $a2, $t1
-; LA32-NEXT:    mulh.wu $a7, $a4, $a5
-; LA32-NEXT:    add.w $a7, $a7, $t3
-; LA32-NEXT:    mul.w $a3, $a3, $a5
-; LA32-NEXT:    add.w $a3, $a7, $a3
-; LA32-NEXT:    add.w $a2, $a2, $a3
-; LA32-NEXT:    add.w $a2, $a2, $t5
-; LA32-NEXT:    mul.w $a1, $a1, $a4
+; LA32-NEXT:    vmulwod.q.du.d $vr0, $vr1, $vr0
+; LA32-NEXT:    vpickve2gr.w $a1, $vr0, 0
+; LA32-NEXT:    vpickve2gr.w $a2, $vr0, 1
+; LA32-NEXT:    vpickve2gr.w $a3, $vr0, 2
+; LA32-NEXT:    vpickve2gr.w $a4, $vr0, 3
+; LA32-NEXT:    st.w $a4, $a0, 12
+; LA32-NEXT:    st.w $a3, $a0, 8
+; LA32-NEXT:    st.w $a2, $a0, 4
 ; LA32-NEXT:    st.w $a1, $a0, 0
-; LA32-NEXT:    st.w $a6, $a0, 4
-; LA32-NEXT:    st.w $t4, $a0, 8
-; LA32-NEXT:    st.w $a2, $a0, 12
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: vmulwod_q_du_d_1:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    ld.d $a1, $a1, 8
-; LA64-NEXT:    ld.d $a2, $a2, 8
-; LA64-NEXT:    srai.d $a3, $a1, 63
-; LA64-NEXT:    mulh.du $a4, $a1, $a2
-; LA64-NEXT:    mul.d $a3, $a3, $a2
-; LA64-NEXT:    add.d $a3, $a4, $a3
-; LA64-NEXT:    mul.d $a1, $a1, $a2
-; LA64-NEXT:    st.d $a1, $a0, 0
-; LA64-NEXT:    st.d $a3, $a0, 8
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vmulwod.q.du.d $vr0, $vr1, $vr0
+; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
 entry:
   %va = load <2 x i64>, ptr %a

>From f3e91f4976d0f96de0f0b8a24cdb2be67e77df75 Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Thu, 9 Oct 2025 17:21:23 +0800
Subject: [PATCH 3/3] [LoongArch] Extend the MUL combine to also handle v2i128

---
 .../LoongArch/LoongArchISelLowering.cpp       |  17 +-
 .../lasx/ir-instruction/mulwev_od.ll          | 928 ++----------------
 2 files changed, 71 insertions(+), 874 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 032032874cd11..9952cfaefee57 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -6681,7 +6681,8 @@ performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
 }
 
 static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
-                                 TargetLowering::DAGCombinerInfo &DCI) {
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const LoongArchSubtarget &Subtarget) {
   if (!DCI.isBeforeLegalize())
     return SDValue();
 
@@ -6690,9 +6691,17 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // Note: v2i128 is an unsupported MVT vector type (see
+  // MachineValueType.h::getVectorVT()), use NumElements and SizeInBits to
+  // identify it.
+  bool HasLSXOnly = Subtarget.hasExtLSX() && !Subtarget.hasExtLASX();
+  bool Isv2i128 = ResTy.isVector() && ResTy.getVectorNumElements() == 2 &&
+                  ResTy.getScalarSizeInBits() == 128;
   if (ResTy != MVT::v8i16 && ResTy != MVT::v4i32 && ResTy != MVT::v2i64 &&
-      ResTy != MVT::v16i16 && ResTy != MVT::v8i32 && ResTy != MVT::v4i64 &&
-      ResTy != MVT::i128)
+      ResTy != MVT::i128 && ResTy != MVT::v16i16 && ResTy != MVT::v8i32 &&
+      ResTy != MVT::v4i64 && !Isv2i128)
+    return SDValue();
+  if (HasLSXOnly && (ResTy.is256BitVector() || Isv2i128))
     return SDValue();
 
   // Combine:
@@ -6837,7 +6846,7 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::EXTRACT_VECTOR_ELT:
     return performEXTRACT_VECTOR_ELTCombine(N, DAG, DCI, Subtarget);
   case ISD::MUL:
-    return performMULCombine(N, DAG, DCI);
+    return performMULCombine(N, DAG, DCI, Subtarget);
   }
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll
index 605325f4dc4f4..ed3a31d12ee83 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mulwev_od.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
 
 define void @vmulwev_h_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: vmulwev_h_b:
@@ -63,139 +63,13 @@ entry:
 }
 
 define void @vmulwev_q_d(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_q_d:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -32
-; LA32-NEXT:    st.w $fp, $sp, 28 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s0, $sp, 24 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s1, $sp, 20 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s2, $sp, 16 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s3, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s4, $sp, 8 # 4-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 4
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA32-NEXT:    xvpickve2gr.w $t3, $xr0, 1
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr0, 5
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr1, 4
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr1, 0
-; LA32-NEXT:    xvpickve2gr.w $t4, $xr1, 1
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr1, 5
-; LA32-NEXT:    srai.w $t1, $a5, 31
-; LA32-NEXT:    srai.w $t5, $t3, 31
-; LA32-NEXT:    srai.w $t0, $a7, 31
-; LA32-NEXT:    srai.w $t6, $t4, 31
-; LA32-NEXT:    mulh.wu $a6, $a2, $a4
-; LA32-NEXT:    mul.w $t2, $t3, $a4
-; LA32-NEXT:    add.w $a6, $t2, $a6
-; LA32-NEXT:    sltu $t2, $a6, $t2
-; LA32-NEXT:    mulh.wu $t7, $t3, $a4
-; LA32-NEXT:    add.w $t7, $t7, $t2
-; LA32-NEXT:    mul.w $t2, $a2, $t4
-; LA32-NEXT:    add.w $a6, $t2, $a6
-; LA32-NEXT:    sltu $t2, $a6, $t2
-; LA32-NEXT:    mulh.wu $t8, $a2, $t4
-; LA32-NEXT:    add.w $t2, $t8, $t2
-; LA32-NEXT:    add.w $t8, $t7, $t2
-; LA32-NEXT:    mul.w $fp, $t3, $t4
-; LA32-NEXT:    add.w $s0, $fp, $t8
-; LA32-NEXT:    mul.w $s1, $a4, $t5
-; LA32-NEXT:    mul.w $s2, $t6, $a2
-; LA32-NEXT:    add.w $s3, $s2, $s1
-; LA32-NEXT:    add.w $t2, $s0, $s3
-; LA32-NEXT:    sltu $s4, $t2, $s0
-; LA32-NEXT:    sltu $fp, $s0, $fp
-; LA32-NEXT:    sltu $t7, $t8, $t7
-; LA32-NEXT:    mulh.wu $t8, $t3, $t4
-; LA32-NEXT:    add.w $t7, $t8, $t7
-; LA32-NEXT:    add.w $t7, $t7, $fp
-; LA32-NEXT:    mulh.wu $t8, $a4, $t5
-; LA32-NEXT:    add.w $t8, $t8, $s1
-; LA32-NEXT:    mul.w $t4, $t4, $t5
-; LA32-NEXT:    add.w $t4, $t8, $t4
-; LA32-NEXT:    mul.w $t3, $t6, $t3
-; LA32-NEXT:    mulh.wu $t5, $t6, $a2
-; LA32-NEXT:    add.w $t3, $t5, $t3
-; LA32-NEXT:    add.w $t3, $t3, $s2
-; LA32-NEXT:    add.w $t3, $t3, $t4
-; LA32-NEXT:    sltu $t4, $s3, $s2
-; LA32-NEXT:    add.w $t3, $t3, $t4
-; LA32-NEXT:    add.w $t3, $t7, $t3
-; LA32-NEXT:    add.w $t3, $t3, $s4
-; LA32-NEXT:    mulh.wu $t4, $a1, $a3
-; LA32-NEXT:    mul.w $t5, $a5, $a3
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    sltu $t5, $t4, $t5
-; LA32-NEXT:    mulh.wu $t6, $a5, $a3
-; LA32-NEXT:    add.w $t5, $t6, $t5
-; LA32-NEXT:    mul.w $t6, $a1, $a7
-; LA32-NEXT:    add.w $t4, $t6, $t4
-; LA32-NEXT:    sltu $t6, $t4, $t6
-; LA32-NEXT:    mulh.wu $t7, $a1, $a7
-; LA32-NEXT:    add.w $t6, $t7, $t6
-; LA32-NEXT:    add.w $t6, $t5, $t6
-; LA32-NEXT:    mul.w $t7, $a5, $a7
-; LA32-NEXT:    add.w $t8, $t7, $t6
-; LA32-NEXT:    mul.w $fp, $a3, $t1
-; LA32-NEXT:    mul.w $s0, $t0, $a1
-; LA32-NEXT:    add.w $s1, $s0, $fp
-; LA32-NEXT:    add.w $s2, $t8, $s1
-; LA32-NEXT:    sltu $s3, $s2, $t8
-; LA32-NEXT:    sltu $t7, $t8, $t7
-; LA32-NEXT:    sltu $t5, $t6, $t5
-; LA32-NEXT:    mulh.wu $t6, $a5, $a7
-; LA32-NEXT:    add.w $t5, $t6, $t5
-; LA32-NEXT:    add.w $t5, $t5, $t7
-; LA32-NEXT:    mulh.wu $t6, $a3, $t1
-; LA32-NEXT:    add.w $t6, $t6, $fp
-; LA32-NEXT:    mul.w $a7, $a7, $t1
-; LA32-NEXT:    add.w $a7, $t6, $a7
-; LA32-NEXT:    mul.w $a5, $t0, $a5
-; LA32-NEXT:    mulh.wu $t0, $t0, $a1
-; LA32-NEXT:    add.w $a5, $t0, $a5
-; LA32-NEXT:    add.w $a5, $a5, $s0
-; LA32-NEXT:    add.w $a5, $a5, $a7
-; LA32-NEXT:    sltu $a7, $s1, $s0
-; LA32-NEXT:    add.w $a5, $a5, $a7
-; LA32-NEXT:    add.w $a5, $t5, $a5
-; LA32-NEXT:    add.w $a5, $a5, $s3
-; LA32-NEXT:    mul.w $a2, $a2, $a4
-; LA32-NEXT:    mul.w $a1, $a1, $a3
-; LA32-NEXT:    st.w $a1, $a0, 16
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $t4, $a0, 20
-; LA32-NEXT:    st.w $a6, $a0, 4
-; LA32-NEXT:    st.w $s2, $a0, 24
-; LA32-NEXT:    st.w $t2, $a0, 8
-; LA32-NEXT:    st.w $a5, $a0, 28
-; LA32-NEXT:    st.w $t3, $a0, 12
-; LA32-NEXT:    ld.w $s4, $sp, 8 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $s3, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $s2, $sp, 16 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $s1, $sp, 20 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $s0, $sp, 24 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $fp, $sp, 28 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 32
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_q_d:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT:    xvpickve2gr.d $a2, $xr0, 0
-; LA64-NEXT:    xvpickve2gr.d $a3, $xr1, 2
-; LA64-NEXT:    xvpickve2gr.d $a4, $xr1, 0
-; LA64-NEXT:    mul.d $a5, $a2, $a4
-; LA64-NEXT:    mulh.d $a2, $a2, $a4
-; LA64-NEXT:    mul.d $a4, $a1, $a3
-; LA64-NEXT:    mulh.d $a1, $a1, $a3
-; LA64-NEXT:    st.d $a1, $a0, 24
-; LA64-NEXT:    st.d $a4, $a0, 16
-; LA64-NEXT:    st.d $a2, $a0, 8
-; LA64-NEXT:    st.d $a5, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_q_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.q.d $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
   %vb = load <4 x i64>, ptr %b
@@ -269,139 +143,13 @@ entry:
 }
 
 define void @vmulwod_q_d(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_q_d:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -32
-; LA32-NEXT:    st.w $fp, $sp, 28 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s0, $sp, 24 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s1, $sp, 20 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s2, $sp, 16 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s3, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s4, $sp, 8 # 4-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 6
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 2
-; LA32-NEXT:    xvpickve2gr.w $t3, $xr0, 3
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr0, 7
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr1, 6
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr1, 2
-; LA32-NEXT:    xvpickve2gr.w $t4, $xr1, 3
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr1, 7
-; LA32-NEXT:    srai.w $t1, $a5, 31
-; LA32-NEXT:    srai.w $t5, $t3, 31
-; LA32-NEXT:    srai.w $t0, $a7, 31
-; LA32-NEXT:    srai.w $t6, $t4, 31
-; LA32-NEXT:    mulh.wu $a6, $a2, $a4
-; LA32-NEXT:    mul.w $t2, $t3, $a4
-; LA32-NEXT:    add.w $a6, $t2, $a6
-; LA32-NEXT:    sltu $t2, $a6, $t2
-; LA32-NEXT:    mulh.wu $t7, $t3, $a4
-; LA32-NEXT:    add.w $t7, $t7, $t2
-; LA32-NEXT:    mul.w $t2, $a2, $t4
-; LA32-NEXT:    add.w $a6, $t2, $a6
-; LA32-NEXT:    sltu $t2, $a6, $t2
-; LA32-NEXT:    mulh.wu $t8, $a2, $t4
-; LA32-NEXT:    add.w $t2, $t8, $t2
-; LA32-NEXT:    add.w $t8, $t7, $t2
-; LA32-NEXT:    mul.w $fp, $t3, $t4
-; LA32-NEXT:    add.w $s0, $fp, $t8
-; LA32-NEXT:    mul.w $s1, $a4, $t5
-; LA32-NEXT:    mul.w $s2, $t6, $a2
-; LA32-NEXT:    add.w $s3, $s2, $s1
-; LA32-NEXT:    add.w $t2, $s0, $s3
-; LA32-NEXT:    sltu $s4, $t2, $s0
-; LA32-NEXT:    sltu $fp, $s0, $fp
-; LA32-NEXT:    sltu $t7, $t8, $t7
-; LA32-NEXT:    mulh.wu $t8, $t3, $t4
-; LA32-NEXT:    add.w $t7, $t8, $t7
-; LA32-NEXT:    add.w $t7, $t7, $fp
-; LA32-NEXT:    mulh.wu $t8, $a4, $t5
-; LA32-NEXT:    add.w $t8, $t8, $s1
-; LA32-NEXT:    mul.w $t4, $t4, $t5
-; LA32-NEXT:    add.w $t4, $t8, $t4
-; LA32-NEXT:    mul.w $t3, $t6, $t3
-; LA32-NEXT:    mulh.wu $t5, $t6, $a2
-; LA32-NEXT:    add.w $t3, $t5, $t3
-; LA32-NEXT:    add.w $t3, $t3, $s2
-; LA32-NEXT:    add.w $t3, $t3, $t4
-; LA32-NEXT:    sltu $t4, $s3, $s2
-; LA32-NEXT:    add.w $t3, $t3, $t4
-; LA32-NEXT:    add.w $t3, $t7, $t3
-; LA32-NEXT:    add.w $t3, $t3, $s4
-; LA32-NEXT:    mulh.wu $t4, $a1, $a3
-; LA32-NEXT:    mul.w $t5, $a5, $a3
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    sltu $t5, $t4, $t5
-; LA32-NEXT:    mulh.wu $t6, $a5, $a3
-; LA32-NEXT:    add.w $t5, $t6, $t5
-; LA32-NEXT:    mul.w $t6, $a1, $a7
-; LA32-NEXT:    add.w $t4, $t6, $t4
-; LA32-NEXT:    sltu $t6, $t4, $t6
-; LA32-NEXT:    mulh.wu $t7, $a1, $a7
-; LA32-NEXT:    add.w $t6, $t7, $t6
-; LA32-NEXT:    add.w $t6, $t5, $t6
-; LA32-NEXT:    mul.w $t7, $a5, $a7
-; LA32-NEXT:    add.w $t8, $t7, $t6
-; LA32-NEXT:    mul.w $fp, $a3, $t1
-; LA32-NEXT:    mul.w $s0, $t0, $a1
-; LA32-NEXT:    add.w $s1, $s0, $fp
-; LA32-NEXT:    add.w $s2, $t8, $s1
-; LA32-NEXT:    sltu $s3, $s2, $t8
-; LA32-NEXT:    sltu $t7, $t8, $t7
-; LA32-NEXT:    sltu $t5, $t6, $t5
-; LA32-NEXT:    mulh.wu $t6, $a5, $a7
-; LA32-NEXT:    add.w $t5, $t6, $t5
-; LA32-NEXT:    add.w $t5, $t5, $t7
-; LA32-NEXT:    mulh.wu $t6, $a3, $t1
-; LA32-NEXT:    add.w $t6, $t6, $fp
-; LA32-NEXT:    mul.w $a7, $a7, $t1
-; LA32-NEXT:    add.w $a7, $t6, $a7
-; LA32-NEXT:    mul.w $a5, $t0, $a5
-; LA32-NEXT:    mulh.wu $t0, $t0, $a1
-; LA32-NEXT:    add.w $a5, $t0, $a5
-; LA32-NEXT:    add.w $a5, $a5, $s0
-; LA32-NEXT:    add.w $a5, $a5, $a7
-; LA32-NEXT:    sltu $a7, $s1, $s0
-; LA32-NEXT:    add.w $a5, $a5, $a7
-; LA32-NEXT:    add.w $a5, $t5, $a5
-; LA32-NEXT:    add.w $a5, $a5, $s3
-; LA32-NEXT:    mul.w $a2, $a2, $a4
-; LA32-NEXT:    mul.w $a1, $a1, $a3
-; LA32-NEXT:    st.w $a1, $a0, 16
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $t4, $a0, 20
-; LA32-NEXT:    st.w $a6, $a0, 4
-; LA32-NEXT:    st.w $s2, $a0, 24
-; LA32-NEXT:    st.w $t2, $a0, 8
-; LA32-NEXT:    st.w $a5, $a0, 28
-; LA32-NEXT:    st.w $t3, $a0, 12
-; LA32-NEXT:    ld.w $s4, $sp, 8 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $s3, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $s2, $sp, 16 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $s1, $sp, 20 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $s0, $sp, 24 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $fp, $sp, 28 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 32
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_q_d:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 3
-; LA64-NEXT:    xvpickve2gr.d $a2, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.d $a3, $xr1, 3
-; LA64-NEXT:    xvpickve2gr.d $a4, $xr1, 1
-; LA64-NEXT:    mul.d $a5, $a2, $a4
-; LA64-NEXT:    mulh.d $a2, $a2, $a4
-; LA64-NEXT:    mul.d $a4, $a1, $a3
-; LA64-NEXT:    mulh.d $a1, $a1, $a3
-; LA64-NEXT:    st.d $a1, $a0, 24
-; LA64-NEXT:    st.d $a4, $a0, 16
-; LA64-NEXT:    st.d $a2, $a0, 8
-; LA64-NEXT:    st.d $a5, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_q_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.q.d $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
   %vb = load <4 x i64>, ptr %b
@@ -475,85 +223,13 @@ entry:
 }
 
 define void @vmulwev_q_du(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_q_du:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 5
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 4
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 1
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr0, 0
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr1, 5
-; LA32-NEXT:    xvpickve2gr.w $a6, $xr1, 4
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr1, 1
-; LA32-NEXT:    xvpickve2gr.w $t0, $xr1, 0
-; LA32-NEXT:    mulh.wu $t1, $a4, $t0
-; LA32-NEXT:    mul.w $t2, $a3, $t0
-; LA32-NEXT:    add.w $t1, $t2, $t1
-; LA32-NEXT:    sltu $t2, $t1, $t2
-; LA32-NEXT:    mulh.wu $t3, $a3, $t0
-; LA32-NEXT:    add.w $t2, $t3, $t2
-; LA32-NEXT:    mul.w $t3, $a4, $a7
-; LA32-NEXT:    add.w $t1, $t3, $t1
-; LA32-NEXT:    sltu $t3, $t1, $t3
-; LA32-NEXT:    mulh.wu $t4, $a4, $a7
-; LA32-NEXT:    add.w $t3, $t4, $t3
-; LA32-NEXT:    add.w $t3, $t2, $t3
-; LA32-NEXT:    mul.w $t4, $a3, $a7
-; LA32-NEXT:    add.w $t5, $t4, $t3
-; LA32-NEXT:    sltu $t4, $t5, $t4
-; LA32-NEXT:    sltu $t2, $t3, $t2
-; LA32-NEXT:    mulh.wu $a3, $a3, $a7
-; LA32-NEXT:    add.w $a3, $a3, $t2
-; LA32-NEXT:    add.w $a3, $a3, $t4
-; LA32-NEXT:    mulh.wu $a7, $a2, $a6
-; LA32-NEXT:    mul.w $t2, $a1, $a6
-; LA32-NEXT:    add.w $a7, $t2, $a7
-; LA32-NEXT:    sltu $t2, $a7, $t2
-; LA32-NEXT:    mulh.wu $t3, $a1, $a6
-; LA32-NEXT:    add.w $t2, $t3, $t2
-; LA32-NEXT:    mul.w $t3, $a2, $a5
-; LA32-NEXT:    add.w $a7, $t3, $a7
-; LA32-NEXT:    sltu $t3, $a7, $t3
-; LA32-NEXT:    mulh.wu $t4, $a2, $a5
-; LA32-NEXT:    add.w $t3, $t4, $t3
-; LA32-NEXT:    add.w $t3, $t2, $t3
-; LA32-NEXT:    mul.w $t4, $a1, $a5
-; LA32-NEXT:    add.w $t6, $t4, $t3
-; LA32-NEXT:    sltu $t4, $t6, $t4
-; LA32-NEXT:    sltu $t2, $t3, $t2
-; LA32-NEXT:    mulh.wu $a1, $a1, $a5
-; LA32-NEXT:    add.w $a1, $a1, $t2
-; LA32-NEXT:    add.w $a1, $a1, $t4
-; LA32-NEXT:    mul.w $a4, $a4, $t0
-; LA32-NEXT:    mul.w $a2, $a2, $a6
-; LA32-NEXT:    st.w $a2, $a0, 16
-; LA32-NEXT:    st.w $a4, $a0, 0
-; LA32-NEXT:    st.w $a7, $a0, 20
-; LA32-NEXT:    st.w $t1, $a0, 4
-; LA32-NEXT:    st.w $t6, $a0, 24
-; LA32-NEXT:    st.w $t5, $a0, 8
-; LA32-NEXT:    st.w $a1, $a0, 28
-; LA32-NEXT:    st.w $a3, $a0, 12
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_q_du:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT:    xvpickve2gr.d $a2, $xr0, 0
-; LA64-NEXT:    xvpickve2gr.d $a3, $xr1, 2
-; LA64-NEXT:    xvpickve2gr.d $a4, $xr1, 0
-; LA64-NEXT:    mul.d $a5, $a2, $a4
-; LA64-NEXT:    mulh.du $a2, $a2, $a4
-; LA64-NEXT:    mul.d $a4, $a1, $a3
-; LA64-NEXT:    mulh.du $a1, $a1, $a3
-; LA64-NEXT:    st.d $a1, $a0, 24
-; LA64-NEXT:    st.d $a4, $a0, 16
-; LA64-NEXT:    st.d $a2, $a0, 8
-; LA64-NEXT:    st.d $a5, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_q_du:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.q.du $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
   %vb = load <4 x i64>, ptr %b
@@ -627,85 +303,13 @@ entry:
 }
 
 define void @vmulwod_q_du(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_q_du:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 7
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 6
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 3
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr0, 2
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr1, 7
-; LA32-NEXT:    xvpickve2gr.w $a6, $xr1, 6
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr1, 3
-; LA32-NEXT:    xvpickve2gr.w $t0, $xr1, 2
-; LA32-NEXT:    mulh.wu $t1, $a4, $t0
-; LA32-NEXT:    mul.w $t2, $a3, $t0
-; LA32-NEXT:    add.w $t1, $t2, $t1
-; LA32-NEXT:    sltu $t2, $t1, $t2
-; LA32-NEXT:    mulh.wu $t3, $a3, $t0
-; LA32-NEXT:    add.w $t2, $t3, $t2
-; LA32-NEXT:    mul.w $t3, $a4, $a7
-; LA32-NEXT:    add.w $t1, $t3, $t1
-; LA32-NEXT:    sltu $t3, $t1, $t3
-; LA32-NEXT:    mulh.wu $t4, $a4, $a7
-; LA32-NEXT:    add.w $t3, $t4, $t3
-; LA32-NEXT:    add.w $t3, $t2, $t3
-; LA32-NEXT:    mul.w $t4, $a3, $a7
-; LA32-NEXT:    add.w $t5, $t4, $t3
-; LA32-NEXT:    sltu $t4, $t5, $t4
-; LA32-NEXT:    sltu $t2, $t3, $t2
-; LA32-NEXT:    mulh.wu $a3, $a3, $a7
-; LA32-NEXT:    add.w $a3, $a3, $t2
-; LA32-NEXT:    add.w $a3, $a3, $t4
-; LA32-NEXT:    mulh.wu $a7, $a2, $a6
-; LA32-NEXT:    mul.w $t2, $a1, $a6
-; LA32-NEXT:    add.w $a7, $t2, $a7
-; LA32-NEXT:    sltu $t2, $a7, $t2
-; LA32-NEXT:    mulh.wu $t3, $a1, $a6
-; LA32-NEXT:    add.w $t2, $t3, $t2
-; LA32-NEXT:    mul.w $t3, $a2, $a5
-; LA32-NEXT:    add.w $a7, $t3, $a7
-; LA32-NEXT:    sltu $t3, $a7, $t3
-; LA32-NEXT:    mulh.wu $t4, $a2, $a5
-; LA32-NEXT:    add.w $t3, $t4, $t3
-; LA32-NEXT:    add.w $t3, $t2, $t3
-; LA32-NEXT:    mul.w $t4, $a1, $a5
-; LA32-NEXT:    add.w $t6, $t4, $t3
-; LA32-NEXT:    sltu $t4, $t6, $t4
-; LA32-NEXT:    sltu $t2, $t3, $t2
-; LA32-NEXT:    mulh.wu $a1, $a1, $a5
-; LA32-NEXT:    add.w $a1, $a1, $t2
-; LA32-NEXT:    add.w $a1, $a1, $t4
-; LA32-NEXT:    mul.w $a4, $a4, $t0
-; LA32-NEXT:    mul.w $a2, $a2, $a6
-; LA32-NEXT:    st.w $a2, $a0, 16
-; LA32-NEXT:    st.w $a4, $a0, 0
-; LA32-NEXT:    st.w $a7, $a0, 20
-; LA32-NEXT:    st.w $t1, $a0, 4
-; LA32-NEXT:    st.w $t6, $a0, 24
-; LA32-NEXT:    st.w $t5, $a0, 8
-; LA32-NEXT:    st.w $a1, $a0, 28
-; LA32-NEXT:    st.w $a3, $a0, 12
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_q_du:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 3
-; LA64-NEXT:    xvpickve2gr.d $a2, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.d $a3, $xr1, 3
-; LA64-NEXT:    xvpickve2gr.d $a4, $xr1, 1
-; LA64-NEXT:    mul.d $a5, $a2, $a4
-; LA64-NEXT:    mulh.du $a2, $a2, $a4
-; LA64-NEXT:    mul.d $a4, $a1, $a3
-; LA64-NEXT:    mulh.du $a1, $a1, $a3
-; LA64-NEXT:    st.d $a1, $a0, 24
-; LA64-NEXT:    st.d $a4, $a0, 16
-; LA64-NEXT:    st.d $a2, $a0, 8
-; LA64-NEXT:    st.d $a5, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_q_du:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.q.du $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
   %vb = load <4 x i64>, ptr %b
@@ -779,117 +383,13 @@ entry:
 }
 
 define void @vmulwev_q_du_d(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_q_du_d:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -16
-; LA32-NEXT:    st.w $fp, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s0, $sp, 8 # 4-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 5
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 4
-; LA32-NEXT:    xvpickve2gr.w $a6, $xr0, 1
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr1, 4
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr1, 0
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr1, 1
-; LA32-NEXT:    xvpickve2gr.w $t0, $xr1, 5
-; LA32-NEXT:    srai.w $t1, $t0, 31
-; LA32-NEXT:    srai.w $t2, $a7, 31
-; LA32-NEXT:    mulh.wu $t3, $a2, $a5
-; LA32-NEXT:    mul.w $t4, $a6, $a5
-; LA32-NEXT:    add.w $t3, $t4, $t3
-; LA32-NEXT:    sltu $t4, $t3, $t4
-; LA32-NEXT:    mulh.wu $t5, $a6, $a5
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    mul.w $t5, $a2, $a7
-; LA32-NEXT:    add.w $t3, $t5, $t3
-; LA32-NEXT:    sltu $t5, $t3, $t5
-; LA32-NEXT:    mulh.wu $t6, $a2, $a7
-; LA32-NEXT:    add.w $t5, $t6, $t5
-; LA32-NEXT:    add.w $t5, $t4, $t5
-; LA32-NEXT:    mul.w $t6, $a6, $a7
-; LA32-NEXT:    add.w $t7, $t6, $t5
-; LA32-NEXT:    mul.w $t8, $t2, $a2
-; LA32-NEXT:    add.w $fp, $t7, $t8
-; LA32-NEXT:    sltu $s0, $fp, $t7
-; LA32-NEXT:    sltu $t6, $t7, $t6
-; LA32-NEXT:    sltu $t4, $t5, $t4
-; LA32-NEXT:    mulh.wu $a7, $a6, $a7
-; LA32-NEXT:    add.w $a7, $a7, $t4
-; LA32-NEXT:    add.w $a7, $a7, $t6
-; LA32-NEXT:    mul.w $a6, $t2, $a6
-; LA32-NEXT:    mulh.wu $t2, $t2, $a2
-; LA32-NEXT:    add.w $a6, $t2, $a6
-; LA32-NEXT:    add.w $a6, $a6, $t8
-; LA32-NEXT:    add.w $a6, $a7, $a6
-; LA32-NEXT:    add.w $a6, $a6, $s0
-; LA32-NEXT:    mulh.wu $a7, $a1, $a4
-; LA32-NEXT:    mul.w $t2, $a3, $a4
-; LA32-NEXT:    add.w $a7, $t2, $a7
-; LA32-NEXT:    sltu $t2, $a7, $t2
-; LA32-NEXT:    mulh.wu $t4, $a3, $a4
-; LA32-NEXT:    add.w $t2, $t4, $t2
-; LA32-NEXT:    mul.w $t4, $a1, $t0
-; LA32-NEXT:    add.w $a7, $t4, $a7
-; LA32-NEXT:    sltu $t4, $a7, $t4
-; LA32-NEXT:    mulh.wu $t5, $a1, $t0
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    add.w $t4, $t2, $t4
-; LA32-NEXT:    mul.w $t5, $a3, $t0
-; LA32-NEXT:    add.w $t6, $t5, $t4
-; LA32-NEXT:    mul.w $t7, $t1, $a1
-; LA32-NEXT:    add.w $t8, $t6, $t7
-; LA32-NEXT:    sltu $s0, $t8, $t6
-; LA32-NEXT:    sltu $t5, $t6, $t5
-; LA32-NEXT:    sltu $t2, $t4, $t2
-; LA32-NEXT:    mulh.wu $t0, $a3, $t0
-; LA32-NEXT:    add.w $t0, $t0, $t2
-; LA32-NEXT:    add.w $t0, $t0, $t5
-; LA32-NEXT:    mul.w $a3, $t1, $a3
-; LA32-NEXT:    mulh.wu $t1, $t1, $a1
-; LA32-NEXT:    add.w $a3, $t1, $a3
-; LA32-NEXT:    add.w $a3, $a3, $t7
-; LA32-NEXT:    add.w $a3, $t0, $a3
-; LA32-NEXT:    add.w $a3, $a3, $s0
-; LA32-NEXT:    mul.w $a2, $a2, $a5
-; LA32-NEXT:    mul.w $a1, $a1, $a4
-; LA32-NEXT:    st.w $a1, $a0, 16
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $a7, $a0, 20
-; LA32-NEXT:    st.w $t3, $a0, 4
-; LA32-NEXT:    st.w $t8, $a0, 24
-; LA32-NEXT:    st.w $fp, $a0, 8
-; LA32-NEXT:    st.w $a3, $a0, 28
-; LA32-NEXT:    st.w $a6, $a0, 12
-; LA32-NEXT:    ld.w $s0, $sp, 8 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $fp, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 16
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_q_du_d:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT:    xvpickve2gr.d $a2, $xr0, 0
-; LA64-NEXT:    xvpickve2gr.d $a3, $xr1, 0
-; LA64-NEXT:    xvpickve2gr.d $a4, $xr1, 2
-; LA64-NEXT:    srai.d $a5, $a4, 63
-; LA64-NEXT:    srai.d $a6, $a3, 63
-; LA64-NEXT:    mulh.du $a7, $a2, $a3
-; LA64-NEXT:    mul.d $a6, $a2, $a6
-; LA64-NEXT:    add.d $a6, $a7, $a6
-; LA64-NEXT:    mulh.du $a7, $a1, $a4
-; LA64-NEXT:    mul.d $a5, $a1, $a5
-; LA64-NEXT:    add.d $a5, $a7, $a5
-; LA64-NEXT:    mul.d $a2, $a2, $a3
-; LA64-NEXT:    mul.d $a1, $a1, $a4
-; LA64-NEXT:    st.d $a1, $a0, 16
-; LA64-NEXT:    st.d $a2, $a0, 0
-; LA64-NEXT:    st.d $a5, $a0, 24
-; LA64-NEXT:    st.d $a6, $a0, 8
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_q_du_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.q.du.d $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
   %vb = load <4 x i64>, ptr %b
@@ -963,117 +463,13 @@ entry:
 }
 
 define void @vmulwod_q_du_d(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_q_du_d:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -16
-; LA32-NEXT:    st.w $fp, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s0, $sp, 8 # 4-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 6
-; LA32-NEXT:    xvpickve2gr.w $a6, $xr0, 3
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 2
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr1, 6
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr1, 2
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr1, 3
-; LA32-NEXT:    xvpickve2gr.w $t0, $xr1, 7
-; LA32-NEXT:    srai.w $t1, $t0, 31
-; LA32-NEXT:    srai.w $t2, $a7, 31
-; LA32-NEXT:    mulh.wu $t3, $a2, $a5
-; LA32-NEXT:    mul.w $t4, $a6, $a5
-; LA32-NEXT:    add.w $t3, $t4, $t3
-; LA32-NEXT:    sltu $t4, $t3, $t4
-; LA32-NEXT:    mulh.wu $t5, $a6, $a5
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    mul.w $t5, $a2, $a7
-; LA32-NEXT:    add.w $t3, $t5, $t3
-; LA32-NEXT:    sltu $t5, $t3, $t5
-; LA32-NEXT:    mulh.wu $t6, $a2, $a7
-; LA32-NEXT:    add.w $t5, $t6, $t5
-; LA32-NEXT:    add.w $t5, $t4, $t5
-; LA32-NEXT:    mul.w $t6, $a6, $a7
-; LA32-NEXT:    add.w $t7, $t6, $t5
-; LA32-NEXT:    mul.w $t8, $t2, $a2
-; LA32-NEXT:    add.w $fp, $t7, $t8
-; LA32-NEXT:    sltu $s0, $fp, $t7
-; LA32-NEXT:    sltu $t6, $t7, $t6
-; LA32-NEXT:    sltu $t4, $t5, $t4
-; LA32-NEXT:    mulh.wu $a7, $a6, $a7
-; LA32-NEXT:    add.w $a7, $a7, $t4
-; LA32-NEXT:    add.w $a7, $a7, $t6
-; LA32-NEXT:    mul.w $a6, $t2, $a6
-; LA32-NEXT:    mulh.wu $t2, $t2, $a2
-; LA32-NEXT:    add.w $a6, $t2, $a6
-; LA32-NEXT:    add.w $a6, $a6, $t8
-; LA32-NEXT:    add.w $a6, $a7, $a6
-; LA32-NEXT:    add.w $a6, $a6, $s0
-; LA32-NEXT:    mulh.wu $a7, $a1, $a4
-; LA32-NEXT:    mul.w $t2, $a3, $a4
-; LA32-NEXT:    add.w $a7, $t2, $a7
-; LA32-NEXT:    sltu $t2, $a7, $t2
-; LA32-NEXT:    mulh.wu $t4, $a3, $a4
-; LA32-NEXT:    add.w $t2, $t4, $t2
-; LA32-NEXT:    mul.w $t4, $a1, $t0
-; LA32-NEXT:    add.w $a7, $t4, $a7
-; LA32-NEXT:    sltu $t4, $a7, $t4
-; LA32-NEXT:    mulh.wu $t5, $a1, $t0
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    add.w $t4, $t2, $t4
-; LA32-NEXT:    mul.w $t5, $a3, $t0
-; LA32-NEXT:    add.w $t6, $t5, $t4
-; LA32-NEXT:    mul.w $t7, $t1, $a1
-; LA32-NEXT:    add.w $t8, $t6, $t7
-; LA32-NEXT:    sltu $s0, $t8, $t6
-; LA32-NEXT:    sltu $t5, $t6, $t5
-; LA32-NEXT:    sltu $t2, $t4, $t2
-; LA32-NEXT:    mulh.wu $t0, $a3, $t0
-; LA32-NEXT:    add.w $t0, $t0, $t2
-; LA32-NEXT:    add.w $t0, $t0, $t5
-; LA32-NEXT:    mul.w $a3, $t1, $a3
-; LA32-NEXT:    mulh.wu $t1, $t1, $a1
-; LA32-NEXT:    add.w $a3, $t1, $a3
-; LA32-NEXT:    add.w $a3, $a3, $t7
-; LA32-NEXT:    add.w $a3, $t0, $a3
-; LA32-NEXT:    add.w $a3, $a3, $s0
-; LA32-NEXT:    mul.w $a2, $a2, $a5
-; LA32-NEXT:    mul.w $a1, $a1, $a4
-; LA32-NEXT:    st.w $a1, $a0, 16
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $a7, $a0, 20
-; LA32-NEXT:    st.w $t3, $a0, 4
-; LA32-NEXT:    st.w $t8, $a0, 24
-; LA32-NEXT:    st.w $fp, $a0, 8
-; LA32-NEXT:    st.w $a3, $a0, 28
-; LA32-NEXT:    st.w $a6, $a0, 12
-; LA32-NEXT:    ld.w $s0, $sp, 8 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $fp, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 16
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_q_du_d:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 3
-; LA64-NEXT:    xvpickve2gr.d $a2, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.d $a3, $xr1, 1
-; LA64-NEXT:    xvpickve2gr.d $a4, $xr1, 3
-; LA64-NEXT:    srai.d $a5, $a4, 63
-; LA64-NEXT:    srai.d $a6, $a3, 63
-; LA64-NEXT:    mulh.du $a7, $a2, $a3
-; LA64-NEXT:    mul.d $a6, $a2, $a6
-; LA64-NEXT:    add.d $a6, $a7, $a6
-; LA64-NEXT:    mulh.du $a7, $a1, $a4
-; LA64-NEXT:    mul.d $a5, $a1, $a5
-; LA64-NEXT:    add.d $a5, $a7, $a5
-; LA64-NEXT:    mul.d $a2, $a2, $a3
-; LA64-NEXT:    mul.d $a1, $a1, $a4
-; LA64-NEXT:    st.d $a1, $a0, 16
-; LA64-NEXT:    st.d $a2, $a0, 0
-; LA64-NEXT:    st.d $a5, $a0, 24
-; LA64-NEXT:    st.d $a6, $a0, 8
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_q_du_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.q.du.d $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
   %vb = load <4 x i64>, ptr %b
@@ -1147,117 +543,13 @@ entry:
 }
 
 define void @vmulwev_q_du_d_1(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwev_q_du_d_1:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -16
-; LA32-NEXT:    st.w $fp, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s0, $sp, 8 # 4-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 4
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 0
-; LA32-NEXT:    xvpickve2gr.w $a6, $xr0, 1
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr0, 5
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr1, 5
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr1, 4
-; LA32-NEXT:    xvpickve2gr.w $t0, $xr1, 1
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr1, 0
-; LA32-NEXT:    srai.w $t1, $a7, 31
-; LA32-NEXT:    srai.w $t2, $a6, 31
-; LA32-NEXT:    mulh.wu $t3, $a2, $a4
-; LA32-NEXT:    mul.w $t4, $a6, $a4
-; LA32-NEXT:    add.w $t3, $t4, $t3
-; LA32-NEXT:    sltu $t4, $t3, $t4
-; LA32-NEXT:    mulh.wu $t5, $a6, $a4
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    mul.w $t5, $a2, $t0
-; LA32-NEXT:    add.w $t3, $t5, $t3
-; LA32-NEXT:    sltu $t5, $t3, $t5
-; LA32-NEXT:    mulh.wu $t6, $a2, $t0
-; LA32-NEXT:    add.w $t5, $t6, $t5
-; LA32-NEXT:    add.w $t5, $t4, $t5
-; LA32-NEXT:    mul.w $t6, $a6, $t0
-; LA32-NEXT:    add.w $t7, $t6, $t5
-; LA32-NEXT:    mul.w $t8, $a4, $t2
-; LA32-NEXT:    add.w $fp, $t7, $t8
-; LA32-NEXT:    sltu $s0, $fp, $t7
-; LA32-NEXT:    sltu $t6, $t7, $t6
-; LA32-NEXT:    sltu $t4, $t5, $t4
-; LA32-NEXT:    mulh.wu $a6, $a6, $t0
-; LA32-NEXT:    add.w $a6, $a6, $t4
-; LA32-NEXT:    add.w $a6, $a6, $t6
-; LA32-NEXT:    mulh.wu $t4, $a4, $t2
-; LA32-NEXT:    add.w $t4, $t4, $t8
-; LA32-NEXT:    mul.w $t0, $t0, $t2
-; LA32-NEXT:    add.w $t0, $t4, $t0
-; LA32-NEXT:    add.w $a6, $a6, $t0
-; LA32-NEXT:    add.w $a6, $a6, $s0
-; LA32-NEXT:    mulh.wu $t0, $a1, $a3
-; LA32-NEXT:    mul.w $t2, $a7, $a3
-; LA32-NEXT:    add.w $t0, $t2, $t0
-; LA32-NEXT:    sltu $t2, $t0, $t2
-; LA32-NEXT:    mulh.wu $t4, $a7, $a3
-; LA32-NEXT:    add.w $t2, $t4, $t2
-; LA32-NEXT:    mul.w $t4, $a1, $a5
-; LA32-NEXT:    add.w $t0, $t4, $t0
-; LA32-NEXT:    sltu $t4, $t0, $t4
-; LA32-NEXT:    mulh.wu $t5, $a1, $a5
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    add.w $t4, $t2, $t4
-; LA32-NEXT:    mul.w $t5, $a7, $a5
-; LA32-NEXT:    add.w $t6, $t5, $t4
-; LA32-NEXT:    mul.w $t7, $a3, $t1
-; LA32-NEXT:    add.w $t8, $t6, $t7
-; LA32-NEXT:    sltu $s0, $t8, $t6
-; LA32-NEXT:    sltu $t5, $t6, $t5
-; LA32-NEXT:    sltu $t2, $t4, $t2
-; LA32-NEXT:    mulh.wu $a7, $a7, $a5
-; LA32-NEXT:    add.w $a7, $a7, $t2
-; LA32-NEXT:    add.w $a7, $a7, $t5
-; LA32-NEXT:    mulh.wu $t2, $a3, $t1
-; LA32-NEXT:    add.w $t2, $t2, $t7
-; LA32-NEXT:    mul.w $a5, $a5, $t1
-; LA32-NEXT:    add.w $a5, $t2, $a5
-; LA32-NEXT:    add.w $a5, $a7, $a5
-; LA32-NEXT:    add.w $a5, $a5, $s0
-; LA32-NEXT:    mul.w $a2, $a2, $a4
-; LA32-NEXT:    mul.w $a1, $a1, $a3
-; LA32-NEXT:    st.w $a1, $a0, 16
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $t0, $a0, 20
-; LA32-NEXT:    st.w $t3, $a0, 4
-; LA32-NEXT:    st.w $t8, $a0, 24
-; LA32-NEXT:    st.w $fp, $a0, 8
-; LA32-NEXT:    st.w $a5, $a0, 28
-; LA32-NEXT:    st.w $a6, $a0, 12
-; LA32-NEXT:    ld.w $s0, $sp, 8 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $fp, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 16
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwev_q_du_d_1:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 0
-; LA64-NEXT:    xvpickve2gr.d $a2, $xr0, 2
-; LA64-NEXT:    xvpickve2gr.d $a3, $xr1, 2
-; LA64-NEXT:    xvpickve2gr.d $a4, $xr1, 0
-; LA64-NEXT:    srai.d $a5, $a2, 63
-; LA64-NEXT:    srai.d $a6, $a1, 63
-; LA64-NEXT:    mulh.du $a7, $a1, $a4
-; LA64-NEXT:    mul.d $a6, $a6, $a4
-; LA64-NEXT:    add.d $a6, $a7, $a6
-; LA64-NEXT:    mulh.du $a7, $a2, $a3
-; LA64-NEXT:    mul.d $a5, $a5, $a3
-; LA64-NEXT:    add.d $a5, $a7, $a5
-; LA64-NEXT:    mul.d $a1, $a1, $a4
-; LA64-NEXT:    mul.d $a2, $a2, $a3
-; LA64-NEXT:    st.d $a2, $a0, 16
-; LA64-NEXT:    st.d $a1, $a0, 0
-; LA64-NEXT:    st.d $a5, $a0, 24
-; LA64-NEXT:    st.d $a6, $a0, 8
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwev_q_du_d_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwev.q.du.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
   %vb = load <4 x i64>, ptr %b
@@ -1331,117 +623,13 @@ entry:
 }
 
 define void @vmulwod_q_du_d_1(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: vmulwod_q_du_d_1:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -16
-; LA32-NEXT:    st.w $fp, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $s0, $sp, 8 # 4-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 6
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 2
-; LA32-NEXT:    xvpickve2gr.w $a6, $xr0, 3
-; LA32-NEXT:    xvpickve2gr.w $a7, $xr0, 7
-; LA32-NEXT:    xvpickve2gr.w $a5, $xr1, 7
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr1, 6
-; LA32-NEXT:    xvpickve2gr.w $t0, $xr1, 3
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr1, 2
-; LA32-NEXT:    srai.w $t1, $a7, 31
-; LA32-NEXT:    srai.w $t2, $a6, 31
-; LA32-NEXT:    mulh.wu $t3, $a2, $a4
-; LA32-NEXT:    mul.w $t4, $a6, $a4
-; LA32-NEXT:    add.w $t3, $t4, $t3
-; LA32-NEXT:    sltu $t4, $t3, $t4
-; LA32-NEXT:    mulh.wu $t5, $a6, $a4
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    mul.w $t5, $a2, $t0
-; LA32-NEXT:    add.w $t3, $t5, $t3
-; LA32-NEXT:    sltu $t5, $t3, $t5
-; LA32-NEXT:    mulh.wu $t6, $a2, $t0
-; LA32-NEXT:    add.w $t5, $t6, $t5
-; LA32-NEXT:    add.w $t5, $t4, $t5
-; LA32-NEXT:    mul.w $t6, $a6, $t0
-; LA32-NEXT:    add.w $t7, $t6, $t5
-; LA32-NEXT:    mul.w $t8, $a4, $t2
-; LA32-NEXT:    add.w $fp, $t7, $t8
-; LA32-NEXT:    sltu $s0, $fp, $t7
-; LA32-NEXT:    sltu $t6, $t7, $t6
-; LA32-NEXT:    sltu $t4, $t5, $t4
-; LA32-NEXT:    mulh.wu $a6, $a6, $t0
-; LA32-NEXT:    add.w $a6, $a6, $t4
-; LA32-NEXT:    add.w $a6, $a6, $t6
-; LA32-NEXT:    mulh.wu $t4, $a4, $t2
-; LA32-NEXT:    add.w $t4, $t4, $t8
-; LA32-NEXT:    mul.w $t0, $t0, $t2
-; LA32-NEXT:    add.w $t0, $t4, $t0
-; LA32-NEXT:    add.w $a6, $a6, $t0
-; LA32-NEXT:    add.w $a6, $a6, $s0
-; LA32-NEXT:    mulh.wu $t0, $a1, $a3
-; LA32-NEXT:    mul.w $t2, $a7, $a3
-; LA32-NEXT:    add.w $t0, $t2, $t0
-; LA32-NEXT:    sltu $t2, $t0, $t2
-; LA32-NEXT:    mulh.wu $t4, $a7, $a3
-; LA32-NEXT:    add.w $t2, $t4, $t2
-; LA32-NEXT:    mul.w $t4, $a1, $a5
-; LA32-NEXT:    add.w $t0, $t4, $t0
-; LA32-NEXT:    sltu $t4, $t0, $t4
-; LA32-NEXT:    mulh.wu $t5, $a1, $a5
-; LA32-NEXT:    add.w $t4, $t5, $t4
-; LA32-NEXT:    add.w $t4, $t2, $t4
-; LA32-NEXT:    mul.w $t5, $a7, $a5
-; LA32-NEXT:    add.w $t6, $t5, $t4
-; LA32-NEXT:    mul.w $t7, $a3, $t1
-; LA32-NEXT:    add.w $t8, $t6, $t7
-; LA32-NEXT:    sltu $s0, $t8, $t6
-; LA32-NEXT:    sltu $t5, $t6, $t5
-; LA32-NEXT:    sltu $t2, $t4, $t2
-; LA32-NEXT:    mulh.wu $a7, $a7, $a5
-; LA32-NEXT:    add.w $a7, $a7, $t2
-; LA32-NEXT:    add.w $a7, $a7, $t5
-; LA32-NEXT:    mulh.wu $t2, $a3, $t1
-; LA32-NEXT:    add.w $t2, $t2, $t7
-; LA32-NEXT:    mul.w $a5, $a5, $t1
-; LA32-NEXT:    add.w $a5, $t2, $a5
-; LA32-NEXT:    add.w $a5, $a7, $a5
-; LA32-NEXT:    add.w $a5, $a5, $s0
-; LA32-NEXT:    mul.w $a2, $a2, $a4
-; LA32-NEXT:    mul.w $a1, $a1, $a3
-; LA32-NEXT:    st.w $a1, $a0, 16
-; LA32-NEXT:    st.w $a2, $a0, 0
-; LA32-NEXT:    st.w $t0, $a0, 20
-; LA32-NEXT:    st.w $t3, $a0, 4
-; LA32-NEXT:    st.w $t8, $a0, 24
-; LA32-NEXT:    st.w $fp, $a0, 8
-; LA32-NEXT:    st.w $a5, $a0, 28
-; LA32-NEXT:    st.w $a6, $a0, 12
-; LA32-NEXT:    ld.w $s0, $sp, 8 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $fp, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 16
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: vmulwod_q_du_d_1:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.d $a2, $xr0, 3
-; LA64-NEXT:    xvpickve2gr.d $a3, $xr1, 3
-; LA64-NEXT:    xvpickve2gr.d $a4, $xr1, 1
-; LA64-NEXT:    srai.d $a5, $a2, 63
-; LA64-NEXT:    srai.d $a6, $a1, 63
-; LA64-NEXT:    mulh.du $a7, $a1, $a4
-; LA64-NEXT:    mul.d $a6, $a6, $a4
-; LA64-NEXT:    add.d $a6, $a7, $a6
-; LA64-NEXT:    mulh.du $a7, $a2, $a3
-; LA64-NEXT:    mul.d $a5, $a5, $a3
-; LA64-NEXT:    add.d $a5, $a7, $a5
-; LA64-NEXT:    mul.d $a1, $a1, $a4
-; LA64-NEXT:    mul.d $a2, $a2, $a3
-; LA64-NEXT:    st.d $a2, $a0, 16
-; LA64-NEXT:    st.d $a1, $a0, 0
-; LA64-NEXT:    st.d $a5, $a0, 24
-; LA64-NEXT:    st.d $a6, $a0, 8
-; LA64-NEXT:    ret
+; CHECK-LABEL: vmulwod_q_du_d_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvmulwod.q.du.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
   %vb = load <4 x i64>, ptr %b



More information about the llvm-branch-commits mailing list