[llvm] 2ec380b - [Hexagon] Improve handling of 32-bit mulh/mul_lohi on HVX

Krzysztof Parzyszek via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 22 15:24:48 PDT 2022


Author: Krzysztof Parzyszek
Date: 2022-10-22T15:08:27-07:00
New Revision: 2ec380b23f2e7058a1732582f890f9b41fd94175

URL: https://github.com/llvm/llvm-project/commit/2ec380b23f2e7058a1732582f890f9b41fd94175
DIFF: https://github.com/llvm/llvm-project/commit/2ec380b23f2e7058a1732582f890f9b41fd94175.diff

LOG: [Hexagon] Improve handling of 32-bit mulh/mul_lohi on HVX

Handle MULH[US] by normalizing them into newly invented nodes
HexagonISD::(S|U|US)MUL_LOHI. On HVX v60, if only the high part of
SMUL_LOHI is used, use the original MULHS expansion. In all other
cases, expand the full product.
On HVX v62, always expand the full product.

Introduce Hexagon-specific LLVM IR intrinsics for 32x32 multiplication
returning low/high parts.
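
A minimal IR use of the new intrinsics, for illustration (signatures as
declared in the added test; per the comment in IntrinsicsHexagon.td, the
returned struct is {high, low}):

    %p  = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyss.parts.128B(<32 x i32> %a, <32 x i32> %b)
    %hi = extractvalue {<32 x i32>, <32 x i32>} %p, 0   ; MulHS(%a, %b)
    %lo = extractvalue {<32 x i32>, <32 x i32>} %p, 1   ; Mul(%a, %b)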

Added: 
    llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll

Modified: 
    llvm/include/llvm/IR/IntrinsicsHexagon.td
    llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
    llvm/lib/Target/Hexagon/HexagonISelLowering.h
    llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
    llvm/lib/Target/Hexagon/HexagonPatterns.td
    llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
    llvm/test/CodeGen/Hexagon/autohvx/mulh.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsHexagon.td b/llvm/include/llvm/IR/IntrinsicsHexagon.td
index 52c29ef31f0ad..9c7c43144db5a 100644
--- a/llvm/include/llvm/IR/IntrinsicsHexagon.td
+++ b/llvm/include/llvm/IR/IntrinsicsHexagon.td
@@ -381,6 +381,32 @@ Hexagon_NonGCC_Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
 def int_hexagon_V6_pred_typecast_128B :
 Hexagon_NonGCC_Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
 
+// HVX full-precision multiplication.
+// V6_vmpyss_parts(Vu,Vv) = (MulHS(Vu,Vv),  Mul(Vu,Vv))
+// V6_vmpyuu_parts(Vu,Vv) = (MulHU(Vu,Vv),  Mul(Vu,Vv))
+// V6_vmpyus_parts(Vu,Vv) = (MulHUS(Vu,Vv), Mul(Vu,Vv))
+//
+// Both the (purportedly) 64b and the _128B versions are exactly equivalent
+// regardless of the HVX mode; they are both defined for consistency.
+// The purpose of these intrinsics is to have a uniform way of multiplying two
+// integer vectors in the LLVM IR. Many HVX multiply operations interleave
+// the even-odd results, except for 32x32 multiplications. Also, different
+// HVX versions have different instructions that can be used, so defer the
+// instruction choice to the isel.
+class Hexagon_vv_vv_pure:
+  Hexagon_NonGCC_Intrinsic<
+    [llvm_anyvector_ty, LLVMMatchType<0>],
+    [LLVMMatchType<0>, LLVMMatchType<0>],
+    [IntrNoMem]>;
+
+def int_hexagon_V6_vmpyss_parts:      Hexagon_vv_vv_pure;
+def int_hexagon_V6_vmpyss_parts_128B: Hexagon_vv_vv_pure;
+def int_hexagon_V6_vmpyuu_parts:      Hexagon_vv_vv_pure;
+def int_hexagon_V6_vmpyuu_parts_128B: Hexagon_vv_vv_pure;
+def int_hexagon_V6_vmpyus_parts:      Hexagon_vv_vv_pure;
+def int_hexagon_V6_vmpyus_parts_128B: Hexagon_vv_vv_pure;
+
+
 // Masked vector stores
 //
 // These are all deprecated, the intrinsics matching instruction names

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 8723ac678bdc8..9bd377ab0a5be 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1904,6 +1904,9 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case HexagonISD::MFSHR:         return "HexagonISD::MFSHR";
   case HexagonISD::SSAT:          return "HexagonISD::SSAT";
   case HexagonISD::USAT:          return "HexagonISD::USAT";
+  case HexagonISD::SMUL_LOHI:     return "HexagonISD::SMUL_LOHI";
+  case HexagonISD::UMUL_LOHI:     return "HexagonISD::UMUL_LOHI";
+  case HexagonISD::USMUL_LOHI:    return "HexagonISD::USMUL_LOHI";
   case HexagonISD::VEXTRACTW:     return "HexagonISD::VEXTRACTW";
   case HexagonISD::VINSERTW0:     return "HexagonISD::VINSERTW0";
   case HexagonISD::VROR:          return "HexagonISD::VROR";
@@ -2189,6 +2192,16 @@ HexagonTargetLowering::getPreferredVectorAction(MVT VT) const {
   return TargetLoweringBase::TypeSplitVector;
 }
 
+TargetLoweringBase::LegalizeAction
+HexagonTargetLowering::getCustomOperationAction(SDNode &Op) const {
+  if (Subtarget.useHVXOps()) {
+    unsigned Action = getCustomHvxOperationAction(Op);
+    if (Action != ~0u)
+      return static_cast<TargetLoweringBase::LegalizeAction>(Action);
+  }
+  return TargetLoweringBase::Legal;
+}
+
 std::pair<SDValue, int>
 HexagonTargetLowering::getBaseAndOffset(SDValue Addr) const {
   if (Addr.getOpcode() == ISD::ADD) {

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 2c056992d5cad..ac877364f9210 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -61,6 +61,12 @@ enum NodeType : unsigned {
 
   SSAT,        // Signed saturate.
   USAT,        // Unsigned saturate.
+  SMUL_LOHI,   // Same as ISD::SMUL_LOHI, but opaque to the combiner.
+  UMUL_LOHI,   // Same as ISD::UMUL_LOHI, but opaque to the combiner.
+               // We want to legalize MULH[SU] to [SU]MUL_LOHI, but the
+               // combiner will keep rewriting it back to MULH[SU].
+  USMUL_LOHI,  // Like SMUL_LOHI, but unsigned*signed.
+
   TSTBIT,
   INSERT,
   EXTRACTU,
@@ -164,8 +170,8 @@ class HexagonTargetLowering : public TargetLowering {
       unsigned Index) const override;
 
   bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
-  TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
-      const override;
+  LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+  LegalizeAction getCustomOperationAction(SDNode &Op) const override;
 
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
   void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -355,6 +361,7 @@ class HexagonTargetLowering : public TargetLowering {
 private:
   void initializeHVXLowering();
   unsigned getPreferredHvxVectorAction(MVT VecTy) const;
+  unsigned getCustomHvxOperationAction(SDNode &Op) const;
 
   bool validateConstPtrAlignment(SDValue Ptr, Align NeedAlign, const SDLoc &dl,
                                  SelectionDAG &DAG) const;
@@ -485,6 +492,12 @@ class HexagonTargetLowering : public TargetLowering {
                                     bool Signed, SelectionDAG &DAG) const;
   VectorPair emitHvxShiftRightRnd(SDValue Val, unsigned Amt, bool Signed,
                                   SelectionDAG &DAG) const;
+  SDValue emitHvxMulHsV60(SDValue A, SDValue B, const SDLoc &dl,
+                          SelectionDAG &DAG) const;
+  SDValue emitHvxMulLoHiV60(SDValue A, bool SignedA, SDValue B, bool SignedB,
+                            const SDLoc &dl, SelectionDAG &DAG) const;
+  SDValue emitHvxMulLoHiV62(SDValue A, bool SignedA, SDValue B, bool SignedB,
+                            const SDLoc &dl, SelectionDAG &DAG) const;
 
   SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) const;
@@ -499,6 +512,7 @@ class HexagonTargetLowering : public TargetLowering {
   SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerHvxMulLoHi(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerHvxSelect(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 7364e1822285f..e15508e6038df 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -90,6 +90,7 @@ HexagonTargetLowering::initializeHVXLowering() {
   ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
   ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
   MVT ByteV = Use64b ?  MVT::v64i8 : MVT::v128i8;
+  MVT WordV = Use64b ? MVT::v16i32 : MVT::v32i32;
   MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
 
   auto setPromoteTo = [this] (unsigned Opc, MVT FromTy, MVT ToTy) {
@@ -213,8 +214,11 @@ HexagonTargetLowering::initializeHVXLowering() {
     setOperationAction(ISD::LOAD,               T, Custom);
     setOperationAction(ISD::MLOAD,              T, Custom);
     setOperationAction(ISD::MSTORE,             T, Custom);
-    setOperationAction(ISD::MULHS,              T, Custom);
-    setOperationAction(ISD::MULHU,              T, Custom);
+    if (T.getScalarType() != MVT::i32) {
+      setOperationAction(ISD::MULHS,              T, Legal);
+      setOperationAction(ISD::MULHU,              T, Legal);
+    }
+
     setOperationAction(ISD::BUILD_VECTOR,       T, Custom);
     // Make concat-vectors custom to handle concats of more than 2 vectors.
     setOperationAction(ISD::CONCAT_VECTORS,     T, Custom);
@@ -320,6 +324,12 @@ HexagonTargetLowering::initializeHVXLowering() {
     }
   }
 
+  // Legalize all of these to HexagonISD::[SU]MUL_LOHI.
+  setOperationAction(ISD::MULHS,      WordV, Custom); // -> _LOHI
+  setOperationAction(ISD::MULHU,      WordV, Custom); // -> _LOHI
+  setOperationAction(ISD::SMUL_LOHI,  WordV, Custom);
+  setOperationAction(ISD::UMUL_LOHI,  WordV, Custom);
+
   setCondCodeAction(ISD::SETNE,  MVT::v64f16, Expand);
   setCondCodeAction(ISD::SETLE,  MVT::v64f16, Expand);
   setCondCodeAction(ISD::SETGE,  MVT::v64f16, Expand);
@@ -465,6 +475,18 @@ HexagonTargetLowering::getPreferredHvxVectorAction(MVT VecTy) const {
   return ~0u;
 }
 
+unsigned
+HexagonTargetLowering::getCustomHvxOperationAction(SDNode &Op) const {
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  case HexagonISD::SMUL_LOHI:
+  case HexagonISD::UMUL_LOHI:
+  case HexagonISD::USMUL_LOHI:
+    return TargetLoweringBase::Custom;
+  }
+  return TargetLoweringBase::Legal;
+}
+
 SDValue
 HexagonTargetLowering::getInt(unsigned IntId, MVT ResTy, ArrayRef<SDValue> Ops,
                               const SDLoc &dl, SelectionDAG &DAG) const {
@@ -1882,144 +1904,62 @@ HexagonTargetLowering::LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const {
-  MVT ResTy = ty(Op);
-  assert(ResTy.isVector());
   const SDLoc &dl(Op);
-  SmallVector<int,256> ShuffMask;
+  MVT ResTy = ty(Op);
+  assert(ResTy.getVectorElementType() == MVT::i32);
 
-  MVT ElemTy = ResTy.getVectorElementType();
-  unsigned VecLen = ResTy.getVectorNumElements();
   SDValue Vs = Op.getOperand(0);
   SDValue Vt = Op.getOperand(1);
-  bool IsSigned = Op.getOpcode() == ISD::MULHS;
-
-  if (ElemTy == MVT::i8 || ElemTy == MVT::i16) {
-    // For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...),
-    // V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo,
-    // where Lo = (a0*b0, a2*b2, ...), Hi = (a1*b1, a3*b3, ...).
-    // For i16, use V6_vmpyhv, which behaves in an analogous way to
-    // V6_vmpybv: results Lo and Hi are products of even/odd elements
-    // respectively.
-    MVT ExtTy = typeExtElem(ResTy, 2);
-    unsigned MpyOpc = ElemTy == MVT::i8
-        ? (IsSigned ? Hexagon::V6_vmpybv : Hexagon::V6_vmpyubv)
-        : (IsSigned ? Hexagon::V6_vmpyhv : Hexagon::V6_vmpyuhv);
-    SDValue M = getInstr(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG);
-
-    // Discard low halves of the resulting values, collect the high halves.
-    for (unsigned I = 0; I < VecLen; I += 2) {
-      ShuffMask.push_back(I+1);         // Pick even element.
-      ShuffMask.push_back(I+VecLen+1);  // Pick odd element.
-    }
-    VectorPair P = opSplit(opCastElem(M, ElemTy, DAG), dl, DAG);
-    SDValue BS = getByteShuffle(dl, P.first, P.second, ShuffMask, DAG);
-    return DAG.getBitcast(ResTy, BS);
-  }
 
-  MVT PairTy = typeJoin({ResTy, ResTy});
+  SDVTList ResTys = DAG.getVTList(ResTy, ResTy);
+  unsigned Opc = Op.getOpcode();
 
-  assert(ElemTy == MVT::i32);
-  SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
+  // On HVX v62+, producing the full product is cheap, so legalize MULH to LOHI.
+  if (Opc == ISD::MULHU)
+    return DAG.getNode(HexagonISD::UMUL_LOHI, dl, ResTys, {Vs, Vt}).getValue(1);
+  if (Opc == ISD::MULHS)
+    return DAG.getNode(HexagonISD::SMUL_LOHI, dl, ResTys, {Vs, Vt}).getValue(1);
 
-  auto LoVec = [&DAG,ResTy,dl] (SDValue Pair) {
-    return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, ResTy, Pair);
-  };
-  auto HiVec = [&DAG,ResTy,dl] (SDValue Pair) {
-    return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, ResTy, Pair);
-  };
-
-  auto MulHS_V60 = [&](SDValue Vs, SDValue Vt) {
-    // mulhs(Vs,Vt) =
-    //   = [(Hi(Vs)*2^16 + Lo(Vs)) *s (Hi(Vt)*2^16 + Lo(Vt))] >> 32
-    //   = [Hi(Vs)*2^16 *s Hi(Vt)*2^16 + Hi(Vs) *su Lo(Vt)*2^16
-    //      + Lo(Vs) *us (Hi(Vt)*2^16 + Lo(Vt))] >> 32
-    //   = [Hi(Vs) *s Hi(Vt)*2^32 + Hi(Vs) *su Lo(Vt)*2^16
-    //      + Lo(Vs) *us Vt] >> 32
-    // The low half of Lo(Vs)*Lo(Vt) will be discarded (it's not added to
-    // anything, so it cannot produce any carry over to higher bits),
-    // so everything in [] can be shifted by 16 without loss of precision.
-    //   = [Hi(Vs) *s Hi(Vt)*2^16 + Hi(Vs)*su Lo(Vt) + Lo(Vs)*Vt >> 16] >> 16
-    //   = [Hi(Vs) *s Hi(Vt)*2^16 + Hi(Vs)*su Lo(Vt) + V6_vmpyewuh(Vs,Vt)] >> 16
-    // The final additions need to make sure to properly maintain any
-    // carry-out bits.
-    //
-    //                Hi(Vt) Lo(Vt)
-    //                Hi(Vs) Lo(Vs)
-    //               --------------
-    //                Lo(Vt)*Lo(Vs)  | T0 = V6_vmpyewuh(Vt,Vs) does this,
-    //         Hi(Vt)*Lo(Vs)         |      + dropping the low 16 bits
-    //         Hi(Vs)*Lo(Vt)   | T2
-    //  Hi(Vt)*Hi(Vs)
-
-    SDValue T0 = getInstr(Hexagon::V6_vmpyewuh, dl, ResTy, {Vt, Vs}, DAG);
-    // T1 = get Hi(Vs) into low halves.
-    SDValue T1 = getInstr(Hexagon::V6_vasrw, dl, ResTy, {Vs, S16}, DAG);
-    // P0 = interleaved T1.h*Vt.uh (full precision product)
-    SDValue P0 = getInstr(Hexagon::V6_vmpyhus, dl, PairTy, {T1, Vt}, DAG);
-    // T2 = T1.even(h) * Vt.even(uh), i.e. Hi(Vs)*Lo(Vt)
-    SDValue T2 = LoVec(P0);
-    // We need to add T0+T2, recording the carry-out, which will be 1<<16
-    // added to the final sum.
-    // P1 = interleaved even/odd 32-bit (unsigned) sums of 16-bit halves
-    SDValue P1 = getInstr(Hexagon::V6_vadduhw, dl, PairTy, {T0, T2}, DAG);
-    // P2 = interleaved even/odd 32-bit (signed) sums of 16-bit halves
-    SDValue P2 = getInstr(Hexagon::V6_vaddhw, dl, PairTy, {T0, T2}, DAG);
-    // T3 = full-precision(T0+T2) >> 16
-    // The low halves are added-unsigned, the high ones are added-signed.
-    SDValue T3 = getInstr(Hexagon::V6_vasrw_acc, dl, ResTy,
-                          {HiVec(P2), LoVec(P1), S16}, DAG);
-    SDValue T4 = getInstr(Hexagon::V6_vasrw, dl, ResTy, {Vt, S16}, DAG);
-    // P3 = interleaved Hi(Vt)*Hi(Vs) (full precision),
-    // which is now Lo(T1)*Lo(T4), so we want to keep the even product.
-    SDValue P3 = getInstr(Hexagon::V6_vmpyhv, dl, PairTy, {T1, T4}, DAG);
-    SDValue T5 = LoVec(P3);
-    // Add:
-    SDValue T6 = DAG.getNode(ISD::ADD, dl, ResTy, {T3, T5});
-    return T6;
-  };
+#ifndef NDEBUG
+  Op.dump(&DAG);
+#endif
+  llvm_unreachable("Unexpected mulh operation");
+}
 
-  auto MulHS_V62 = [&](SDValue Vs, SDValue Vt) {
-    MVT PairTy = typeJoin({ResTy, ResTy});
-    SDValue T0 = getInstr(Hexagon::V6_vmpyewuh_64, dl, PairTy, {Vs, Vt}, DAG);
-    SDValue T1 = getInstr(Hexagon::V6_vmpyowh_64_acc, dl, PairTy,
-                          {T0, Vs, Vt}, DAG);
-    return opSplit(T1, dl, DAG).second;
-  };
+SDValue
+HexagonTargetLowering::LowerHvxMulLoHi(SDValue Op, SelectionDAG &DAG) const {
+  const SDLoc &dl(Op);
+  unsigned Opc = Op.getOpcode();
+  SDValue Vu = Op.getOperand(0);
+  SDValue Vv = Op.getOperand(1);
+
+  // If the HI part is not used, convert it to a regular MUL.
+  if (auto HiVal = Op.getValue(1); HiVal.use_empty()) {
+    // Need to preserve the types and the number of values.
+    SDValue Hi = DAG.getUNDEF(ty(HiVal));
+    SDValue Lo = DAG.getNode(ISD::MUL, dl, ty(Op), {Vu, Vv});
+    return DAG.getMergeValues({Lo, Hi}, dl);
+  }
+
+  bool SignedVu = Opc == HexagonISD::SMUL_LOHI;
+  bool SignedVv = Opc == HexagonISD::SMUL_LOHI || Opc == HexagonISD::USMUL_LOHI;
+
+  // Legal on HVX v62+, but lower it here because patterns can't handle multi-
+  // valued nodes.
+  if (Subtarget.useHVXV62Ops())
+    return emitHvxMulLoHiV62(Vu, SignedVu, Vv, SignedVv, dl, DAG);
+
+  if (Opc == HexagonISD::SMUL_LOHI) {
+    // Direct MULHS expansion is cheaper than doing the whole SMUL_LOHI;
+    // for the other signednesses, LOHI is cheaper.
+    if (auto LoVal = Op.getValue(0); LoVal.use_empty()) {
+      SDValue Hi = emitHvxMulHsV60(Vu, Vv, dl, DAG);
+      SDValue Lo = DAG.getUNDEF(ty(LoVal));
+      return DAG.getMergeValues({Lo, Hi}, dl);
+    }
+  }
 
-  if (IsSigned) {
-    if (Subtarget.useHVXV62Ops())
-      return MulHS_V62(Vs, Vt);
-    return MulHS_V60(Vs, Vt);
-  }
-
-  // Unsigned mulhw.
-
-  SDValue P = getInstr(Hexagon::V6_lvsplatw, dl, ResTy,
-                       {DAG.getConstant(0x02020202, dl, MVT::i32)}, DAG);
-  // Multiply-unsigned halfwords:
-  //   LoVec = Vs.uh[2i] * Vt.uh[2i],
-  //   HiVec = Vs.uh[2i+1] * Vt.uh[2i+1]
-  SDValue T0 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {Vs, Vt}, DAG);
-  // The low halves in the LoVec of the pair can be discarded. They are
-  // not added to anything (in the full-precision product), so they cannot
-  // produce a carry into the higher bits.
-  SDValue T1 = getInstr(Hexagon::V6_vlsrw, dl, ResTy, {LoVec(T0), S16}, DAG);
-  // Swap low and high halves in Vt, and do the halfword multiplication
-  // to get products Vs.uh[2i] * Vt.uh[2i+1] and Vs.uh[2i+1] * Vt.uh[2i].
-  SDValue D0 = getInstr(Hexagon::V6_vdelta, dl, ResTy, {Vt, P}, DAG);
-  SDValue T2 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {Vs, D0}, DAG);
-  // T2 has mixed products of halfwords: Lo(Vt)*Hi(Vs) and Hi(Vt)*Lo(Vs).
-  // These products are words, but cannot be added directly because the
-  // sums could overflow. Add these products, by halfwords, where each sum
-  // of a pair of halfwords gives a word.
-  SDValue T3 = getInstr(Hexagon::V6_vadduhw, dl, PairTy,
-                        {LoVec(T2), HiVec(T2)}, DAG);
-  // Add the high halfwords from the products of the low halfwords.
-  SDValue T4 = DAG.getNode(ISD::ADD, dl, ResTy, {T1, LoVec(T3)});
-  SDValue T5 = getInstr(Hexagon::V6_vlsrw, dl, ResTy, {T4, S16}, DAG);
-  SDValue T6 = DAG.getNode(ISD::ADD, dl, ResTy, {HiVec(T0), HiVec(T3)});
-  SDValue T7 = DAG.getNode(ISD::ADD, dl, ResTy, {T5, T6});
-  return T7;
+  return emitHvxMulLoHiV60(Vu, SignedVu, Vv, SignedVv, dl, DAG);
 }
 
 SDValue
@@ -2196,21 +2136,38 @@ HexagonTargetLowering::LowerHvxFunnelShift(SDValue Op,
 SDValue
 HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const {
   const SDLoc &dl(Op);
-  MVT ResTy = ty(Op);
-
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  bool Use64b = Subtarget.useHVX64BOps();
-  unsigned IntPredCast = Use64b ? Intrinsic::hexagon_V6_pred_typecast
-                                : Intrinsic::hexagon_V6_pred_typecast_128B;
-  if (IntNo == IntPredCast) {
-    SDValue Vs = Op.getOperand(1);
-    MVT OpTy = ty(Vs);
-    if (isHvxBoolTy(ResTy) && isHvxBoolTy(OpTy)) {
-      if (ResTy == OpTy)
-        return Vs;
-      return DAG.getNode(HexagonISD::TYPECAST, dl, ResTy, Vs);
+  SmallVector<SDValue> Ops(Op->ops().begin(), Op->ops().end());
+
+  auto Swap = [&](SDValue P) {
+    return DAG.getMergeValues({P.getValue(1), P.getValue(0)}, dl);
+  };
+
+  switch (IntNo) {
+  case Intrinsic::hexagon_V6_pred_typecast:
+  case Intrinsic::hexagon_V6_pred_typecast_128B: {
+    MVT ResTy = ty(Op), InpTy = ty(Ops[1]);
+    if (isHvxBoolTy(ResTy) && isHvxBoolTy(InpTy)) {
+      if (ResTy == InpTy)
+        return Ops[1];
+      return DAG.getNode(HexagonISD::TYPECAST, dl, ResTy, Ops[1]);
     }
+    break;
   }
+  case Intrinsic::hexagon_V6_vmpyss_parts:
+  case Intrinsic::hexagon_V6_vmpyss_parts_128B:
+    return Swap(DAG.getNode(HexagonISD::SMUL_LOHI, dl, Op->getVTList(),
+                            {Ops[1], Ops[2]}));
+  case Intrinsic::hexagon_V6_vmpyuu_parts:
+  case Intrinsic::hexagon_V6_vmpyuu_parts_128B:
+    return Swap(DAG.getNode(HexagonISD::UMUL_LOHI, dl, Op->getVTList(),
+                            {Ops[1], Ops[2]}));
+  case Intrinsic::hexagon_V6_vmpyus_parts:
+  case Intrinsic::hexagon_V6_vmpyus_parts_128B: {
+    return Swap(DAG.getNode(HexagonISD::USMUL_LOHI, dl, Op->getVTList(),
+                            {Ops[1], Ops[2]}));
+  }
+  } // switch
 
   return Op;
 }
@@ -2525,6 +2482,212 @@ HexagonTargetLowering::emitHvxShiftRightRnd(SDValue Val, unsigned Amt,
   return {Mux, Ovf};
 }
 
+SDValue
+HexagonTargetLowering::emitHvxMulHsV60(SDValue A, SDValue B, const SDLoc &dl,
+                                       SelectionDAG &DAG) const {
+  MVT VecTy = ty(A);
+  MVT PairTy = typeJoin({VecTy, VecTy});
+  assert(VecTy.getVectorElementType() == MVT::i32);
+
+  SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
+
+  auto LoVec = [&DAG, VecTy, dl](SDValue Pair) {
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, Pair);
+  };
+  auto HiVec = [&DAG, VecTy, dl](SDValue Pair) {
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, VecTy, Pair);
+  };
+
+  // mulhs(A,B) =
+  //   = [(Hi(A)*2^16 + Lo(A)) *s (Hi(B)*2^16 + Lo(B))] >> 32
+  //   = [Hi(A)*2^16 *s Hi(B)*2^16 + Hi(A) *su Lo(B)*2^16
+  //      + Lo(A) *us (Hi(B)*2^16 + Lo(B))] >> 32
+  //   = [Hi(A) *s Hi(B)*2^32 + Hi(A) *su Lo(B)*2^16 + Lo(A) *us B] >> 32
+  // The low half of Lo(A)*Lo(B) will be discarded (it's not added to
+  // anything, so it cannot produce any carry over to higher bits),
+  // so everything in [] can be shifted by 16 without loss of precision.
+  //   = [Hi(A) *s Hi(B)*2^16 + Hi(A)*su Lo(B) + Lo(A)*B >> 16] >> 16
+  //   = [Hi(A) *s Hi(B)*2^16 + Hi(A)*su Lo(B) + V6_vmpyewuh(A,B)] >> 16
+  // The final additions need to make sure to properly maintain any carry-
+  // out bits.
+  //
+  //                Hi(B) Lo(B)
+  //                Hi(A) Lo(A)
+  //               --------------
+  //                Lo(B)*Lo(A)  | T0 = V6_vmpyewuh(B,A) does this,
+  //         Hi(B)*Lo(A)         |      + dropping the low 16 bits
+  //         Hi(A)*Lo(B)   | T2
+  //  Hi(B)*Hi(A)
+
+  SDValue T0 = getInstr(Hexagon::V6_vmpyewuh, dl, VecTy, {B, A}, DAG);
+  // T1 = get Hi(A) into low halves.
+  SDValue T1 = getInstr(Hexagon::V6_vasrw, dl, VecTy, {A, S16}, DAG);
+  // P0 = interleaved T1.h*B.uh (full precision product)
+  SDValue P0 = getInstr(Hexagon::V6_vmpyhus, dl, PairTy, {T1, B}, DAG);
+  // T2 = T1.even(h) * B.even(uh), i.e. Hi(A)*Lo(B)
+  SDValue T2 = LoVec(P0);
+  // We need to add T0+T2, recording the carry-out, which will be 1<<16
+  // added to the final sum.
+  // P1 = interleaved even/odd 32-bit (unsigned) sums of 16-bit halves
+  SDValue P1 = getInstr(Hexagon::V6_vadduhw, dl, PairTy, {T0, T2}, DAG);
+  // P2 = interleaved even/odd 32-bit (signed) sums of 16-bit halves
+  SDValue P2 = getInstr(Hexagon::V6_vaddhw, dl, PairTy, {T0, T2}, DAG);
+  // T3 = full-precision(T0+T2) >> 16
+  // The low halves are added-unsigned, the high ones are added-signed.
+  SDValue T3 = getInstr(Hexagon::V6_vasrw_acc, dl, VecTy,
+                        {HiVec(P2), LoVec(P1), S16}, DAG);
+  SDValue T4 = getInstr(Hexagon::V6_vasrw, dl, VecTy, {B, S16}, DAG);
+  // P3 = interleaved Hi(B)*Hi(A) (full precision),
+  // which is now Lo(T1)*Lo(T4), so we want to keep the even product.
+  SDValue P3 = getInstr(Hexagon::V6_vmpyhv, dl, PairTy, {T1, T4}, DAG);
+  SDValue T5 = LoVec(P3);
+  // Add:
+  SDValue T6 = DAG.getNode(ISD::ADD, dl, VecTy, {T3, T5});
+  return T6;
+}
+
+SDValue
+HexagonTargetLowering::emitHvxMulLoHiV60(SDValue A, bool SignedA, SDValue B,
+                                         bool SignedB, const SDLoc &dl,
+                                         SelectionDAG &DAG) const {
+  MVT VecTy = ty(A);
+  MVT PairTy = typeJoin({VecTy, VecTy});
+  assert(VecTy.getVectorElementType() == MVT::i32);
+
+  SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
+
+  auto LoVec = [&DAG, VecTy, dl](SDValue Pair) {
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, Pair);
+  };
+  auto HiVec = [&DAG, VecTy, dl](SDValue Pair) {
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, VecTy, Pair);
+  };
+
+  if (SignedA && !SignedB) {
+    // Make A:unsigned, B:signed.
+    std::swap(A, B);
+    std::swap(SignedA, SignedB);
+  }
+
+  // Do halfword-wise multiplications for the unsigned*unsigned product,
+  // then add corrections for signed*signed and unsigned*signed.
+
+  SDValue Lo, Hi;
+
+  // P0:lo = (uu) products of low halves of A and B,
+  // P0:hi = (uu) products of high halves.
+  SDValue P0 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {A, B}, DAG);
+
+  // Swap low/high halves in B
+  SDValue T0 = getInstr(Hexagon::V6_lvsplatw, dl, VecTy,
+                        {DAG.getConstant(0x02020202, dl, MVT::i32)}, DAG);
+  SDValue T1 = getInstr(Hexagon::V6_vdelta, dl, VecTy, {B, T0}, DAG);
+  // P1 = products of even/odd halfwords.
+  // P1:lo = (uu) products of even(A.uh) * odd(B.uh)
+  // P1:hi = (uu) products of odd(A.uh) * even(B.uh)
+  SDValue P1 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {A, T1}, DAG);
+
+  // P2:lo = low halves of P1:lo + P1:hi,
+  // P2:hi = high halves of P1:lo + P1:hi.
+  SDValue P2 =
+      getInstr(Hexagon::V6_vadduhw, dl, PairTy, {HiVec(P1), LoVec(P1)}, DAG);
+  // Still need to add the high halves of P0:lo to P2:lo
+  SDValue T2 = getInstr(Hexagon::V6_vlsrw, dl, VecTy, {LoVec(P0), S16}, DAG);
+  SDValue T3 = DAG.getNode(ISD::ADD, dl, VecTy, {LoVec(P2), T2});
+
+  // The high halves of T3 will contribute to the HI part of LOHI.
+  SDValue T4 =
+      getInstr(Hexagon::V6_vasrw_acc, dl, VecTy, {HiVec(P2), T3, S16}, DAG);
+
+  // The low halves of P2 need to be added to high halves of the LO part.
+  Lo = getInstr(Hexagon::V6_vaslw_acc, dl, VecTy, {LoVec(P0), LoVec(P2), S16},
+                DAG);
+  Hi = DAG.getNode(ISD::ADD, dl, VecTy, {HiVec(P0), T4});
+
+  if (SignedA) {
+    assert(SignedB && "Signed A and unsigned B should have been inverted");
+
+    MVT PredTy = MVT::getVectorVT(MVT::i1, VecTy.getVectorNumElements());
+    SDValue Zero = getZero(dl, VecTy, DAG);
+    SDValue Q0 = DAG.getSetCC(dl, PredTy, A, Zero, ISD::SETLT);
+    SDValue Q1 = DAG.getSetCC(dl, PredTy, B, Zero, ISD::SETLT);
+    SDValue X0 = DAG.getNode(ISD::VSELECT, dl, VecTy, {Q0, B, Zero});
+    SDValue X1 = getInstr(Hexagon::V6_vaddwq, dl, VecTy, {Q1, X0, A}, DAG);
+    Hi = getInstr(Hexagon::V6_vsubw, dl, VecTy, {Hi, X1}, DAG);
+  } else if (SignedB) {
+    // Same correction as for mulhus:
+    // mulhus(A.uw,B.w) = mulhu(A.uw,B.uw) - (A.w if B < 0)
+    MVT PredTy = MVT::getVectorVT(MVT::i1, VecTy.getVectorNumElements());
+    SDValue Zero = getZero(dl, VecTy, DAG);
+    SDValue Q1 = DAG.getSetCC(dl, PredTy, B, Zero, ISD::SETLT);
+    Hi = getInstr(Hexagon::V6_vsubwq, dl, VecTy, {Q1, Hi, A}, DAG);
+  } else {
+    assert(!SignedA && !SignedB);
+  }
+
+  return DAG.getMergeValues({Lo, Hi}, dl);
+}
+
+SDValue
+HexagonTargetLowering::emitHvxMulLoHiV62(SDValue A, bool SignedA, SDValue B,
+                                         bool SignedB, const SDLoc &dl,
+                                         SelectionDAG &DAG) const {
+  MVT VecTy = ty(A);
+  MVT PairTy = typeJoin({VecTy, VecTy});
+  assert(VecTy.getVectorElementType() == MVT::i32);
+
+  auto LoVec = [&DAG, VecTy, dl](SDValue Pair) {
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, Pair);
+  };
+  auto HiVec = [&DAG, VecTy, dl](SDValue Pair) {
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, VecTy, Pair);
+  };
+
+  if (SignedA && !SignedB) {
+    // Make A:unsigned, B:signed.
+    std::swap(A, B);
+    std::swap(SignedA, SignedB);
+  }
+
+  // Do S*S first, then make corrections for U*S or U*U if needed.
+  SDValue P0 = getInstr(Hexagon::V6_vmpyewuh_64, dl, PairTy, {A, B}, DAG);
+  SDValue P1 =
+      getInstr(Hexagon::V6_vmpyowh_64_acc, dl, PairTy, {P0, A, B}, DAG);
+  SDValue Lo = LoVec(P1);
+  SDValue Hi = HiVec(P1);
+
+  if (!SignedB) {
+    assert(!SignedA && "Signed A and unsigned B should have been inverted");
+    SDValue Zero = getZero(dl, VecTy, DAG);
+    MVT PredTy = MVT::getVectorVT(MVT::i1, VecTy.getVectorNumElements());
+
+    // Mulhu(X, Y) = Mulhs(X, Y) + (X, if Y < 0) + (Y, if X < 0).
+    // def: Pat<(VecI32 (mulhu HVI32:$A, HVI32:$B)),
+    //          (V6_vaddw (HiVec (Muls64O $A, $B)),
+    //                    (V6_vaddwq (V6_vgtw (V6_vd0), $B),
+    //                               (V6_vandvqv (V6_vgtw (V6_vd0), $A), $B),
+    //                               $A))>;
+    SDValue Q0 = DAG.getSetCC(dl, PredTy, A, Zero, ISD::SETLT);
+    SDValue Q1 = DAG.getSetCC(dl, PredTy, B, Zero, ISD::SETLT);
+    SDValue T0 = getInstr(Hexagon::V6_vandvqv, dl, VecTy, {Q0, B}, DAG);
+    SDValue T1 = getInstr(Hexagon::V6_vaddwq, dl, VecTy, {Q1, T0, A}, DAG);
+    Hi = getInstr(Hexagon::V6_vaddw, dl, VecTy, {Hi, T1}, DAG);
+  } else if (!SignedA) {
+    SDValue Zero = getZero(dl, VecTy, DAG);
+    MVT PredTy = MVT::getVectorVT(MVT::i1, VecTy.getVectorNumElements());
+
+    // Mulhus(unsigned X, signed Y) = Mulhs(X, Y) + (Y, if X < 0).
+    // def: Pat<(VecI32 (HexagonMULHUS HVI32:$A, HVI32:$B)),
+    //          (V6_vaddwq (V6_vgtw (V6_vd0), $A),
+    //                     (HiVec (Muls64O $A, $B)),
+    //                     $B)>;
+    SDValue Q0 = DAG.getSetCC(dl, PredTy, A, Zero, ISD::SETLT);
+    Hi = getInstr(Hexagon::V6_vaddwq, dl, VecTy, {Q0, Hi, B}, DAG);
+  }
+
+  return DAG.getMergeValues({Lo, Hi}, dl);
+}
+
 SDValue
 HexagonTargetLowering::EqualizeFpIntConversion(SDValue Op, SelectionDAG &DAG)
       const {
@@ -3084,6 +3247,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
     case ISD::FSHR:                    return LowerHvxFunnelShift(Op, DAG);
     case ISD::MULHS:
     case ISD::MULHU:                   return LowerHvxMulh(Op, DAG);
+    case ISD::SMUL_LOHI:
+    case ISD::UMUL_LOHI:               return LowerHvxMulLoHi(Op, DAG);
     case ISD::ANY_EXTEND_VECTOR_INREG: return LowerHvxExtend(Op, DAG);
     case ISD::SETCC:
     case ISD::INTRINSIC_VOID:          return Op;
@@ -3097,6 +3262,11 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
     case ISD::FP_TO_UINT:              return LowerHvxFpToInt(Op, DAG);
     case ISD::SINT_TO_FP:
     case ISD::UINT_TO_FP:              return LowerHvxIntToFp(Op, DAG);
+
+    // Special nodes:
+    case HexagonISD::SMUL_LOHI:
+    case HexagonISD::UMUL_LOHI:
+    case HexagonISD::USMUL_LOHI:       return LowerHvxMulLoHi(Op, DAG);
   }
 #ifndef NDEBUG
   Op.dumpr(&DAG);

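As a reading aid for the emitHvxMulHsV60 derivation above, here is a scalar
C++ model of the same 16-bit-halves decomposition (an illustration written
for this note, not code from the patch); each step mirrors one of the
commented vector operations:

    #include <cstdint>

    // mulhs(A,B) = high 32 bits of (int64_t)A * B, via 16-bit halves.
    int32_t mulhs_model(int32_t A, int32_t B) {
      int32_t  HiA = A >> 16;                // signed high half
      uint32_t LoA = (uint32_t)A & 0xFFFF;   // unsigned low half
      // T0 = Lo(A) *us B with the low 16 bits dropped (V6_vmpyewuh's role).
      int64_t T0 = ((int64_t)LoA * B) >> 16;
      // T2 = Hi(A) *su Lo(B) (the LoVec half of V6_vmpyhus's result).
      int64_t T2 = (int64_t)HiA * ((uint32_t)B & 0xFFFF);
      // (T0 + T2) >> 16 keeps the carry; then add Hi(A) *s Hi(B).
      return (int32_t)(((T0 + T2) >> 16) + (int64_t)HiA * (B >> 16));
    }
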
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 34e41a4e75994..bf6303fd165e9 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -97,6 +97,7 @@ def HexagonPTRUE:      SDNode<"HexagonISD::PTRUE",      SDTVecLeaf>;
 def HexagonPFALSE:     SDNode<"HexagonISD::PFALSE",     SDTVecLeaf>;
 def HexagonVALIGN:     SDNode<"HexagonISD::VALIGN",     SDTVecVecIntOp>;
 def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>;
+def HexagonMULHUS:     SDNode<"HexagonISD::MULHUS",     SDTIntBinOp>;
 
 def SDTSaturate:
   SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>;

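The signedness corrections in emitHvxMulLoHiV60/V62 above (and the
HexagonMULHUS node introduced here) follow from the fact that reinterpreting
a signed 32-bit value as unsigned adds 2^32 exactly when it is negative;
modulo 2^32 this gives mulhu(X,Y) = mulhs(X,Y) + (Y if X < 0) + (X if Y < 0)
and mulhus(X.uw, Y.w) = mulhs(X,Y) + (Y if X < 0). A small scalar self-check
of these identities (written for this note, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0x90000003, Y = 0xfffffffb;  // both negative when signed
      int32_t Xs = (int32_t)X, Ys = (int32_t)Y;
      uint32_t MulHS  = (uint32_t)(((int64_t)Xs * Ys) >> 32);
      uint32_t MulHU  = (uint32_t)(((uint64_t)X * Y) >> 32);
      uint32_t MulHUS = (uint32_t)(((int64_t)X * Ys) >> 32); // unsigned*signed
      assert(MulHU  == MulHS + (Xs < 0 ? Y : 0) + (Ys < 0 ? X : 0));
      assert(MulHUS == MulHS + (Xs < 0 ? Y : 0));
      assert(MulHU  == MulHUS + (Ys < 0 ? X : 0)); // the mulhus comment's form
      return 0;
    }
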
diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
index 4519c850278e8..4a086beccb7c6 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -912,6 +912,20 @@ let Predicates = [UseHVXV68, UseHVXFloatingPoint] in {
            (V6_pred_not (V6_veqw HvxVR:$Vt, HvxVR:$Vu))>;
 }
 
+// Multiply high for non-i32 types
+def: Pat<(VecI8  (mulhs  HVI8:$Vu,  HVI8:$Vv)),
+         (V6_vshuffob (HiVec (V6_vmpybv $Vu, $Vv)),
+                      (LoVec (V6_vmpybv $Vu, $Vv)))>;
+def: Pat<(VecI16 (mulhs HVI16:$Vu, HVI16:$Vv)),
+         (V6_vshufoh (HiVec (V6_vmpyhv $Vu, $Vv)),
+                     (LoVec (V6_vmpyhv $Vu, $Vv)))>;
+def: Pat<(VecI8  (mulhu  HVI8:$Vu,  HVI8:$Vv)),
+         (V6_vshuffob (HiVec (V6_vmpyubv $Vu, $Vv)),
+                      (LoVec (V6_vmpyubv $Vu, $Vv)))>;
+def: Pat<(VecI16 (mulhu HVI16:$Vu, HVI16:$Vv)),
+         (V6_vshufoh (HiVec (V6_vmpyuhv $Vu, $Vv)),
+                     (LoVec (V6_vmpyuhv $Vu, $Vv)))>;
+
 let Predicates = [UseHVXV60] in {
   // V60 doesn't have vabsb or byte shifts.
   // Do the "mask = x >> width-1; abs = (x + mask) ^ mask" trick.

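A scalar picture of the i16 patterns above (an illustration for this note,
not code from the patch): V6_vmpyhv/V6_vmpyuhv produce the full 32-bit
product in each lane, and the vshufoh shuffle then collects the odd (high)
halfwords, which is exactly the multiply-high result:

    #include <cstdint>

    int16_t mulhs16_model(int16_t a, int16_t b) {
      int32_t full = (int32_t)a * b;  // per-lane product from V6_vmpyhv
      return (int16_t)(full >> 16);   // the high halfword vshufoh selects
    }
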
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll b/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll
index 8c9f6a1ad4701..58e8cac9dfe5c 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll
@@ -86,19 +86,16 @@ define <32 x i32> @mulhu(<32 x i32> %a0, <32 x i32> %a1) #0 {
 ; V60-NEXT:     v1:0.uw = vmpy(v0.uh,v1.uh)
 ; V60-NEXT:    }
 ; V60-NEXT:    {
-; V60-NEXT:     v1:0.w = vadd(v0.uh,v1.uh)
+; V60-NEXT:     v1:0.w = vadd(v1.uh,v0.uh)
 ; V60-NEXT:    }
 ; V60-NEXT:    {
-; V60-NEXT:     v0.w = vadd(v2.w,v0.w)
+; V60-NEXT:     v0.w = vadd(v0.w,v2.w)
 ; V60-NEXT:    }
 ; V60-NEXT:    {
-; V60-NEXT:     v1.w = vadd(v3.w,v1.w)
+; V60-NEXT:     v1.w += vasr(v0.w,r2)
 ; V60-NEXT:    }
 ; V60-NEXT:    {
-; V60-NEXT:     v0.uw = vlsr(v0.uw,r2)
-; V60-NEXT:    }
-; V60-NEXT:    {
-; V60-NEXT:     v0.w = vadd(v0.w,v1.w)
+; V60-NEXT:     v0.w = vadd(v3.w,v1.w)
 ; V60-NEXT:    }
 ; V60-NEXT:    {
 ; V60-NEXT:     jumpr r31
@@ -107,40 +104,28 @@ define <32 x i32> @mulhu(<32 x i32> %a0, <32 x i32> %a1) #0 {
 ; V65-LABEL: mulhu:
 ; V65:       // %bb.0:
 ; V65-NEXT:    {
-; V65-NEXT:     r0 = ##33686018
-; V65-NEXT:    }
-; V65-NEXT:    {
-; V65-NEXT:     v3:2.uw = vmpy(v0.uh,v1.uh)
-; V65-NEXT:    }
-; V65-NEXT:    {
-; V65-NEXT:     r2 = #16
-; V65-NEXT:    }
-; V65-NEXT:    {
-; V65-NEXT:     v4 = vsplat(r0)
-; V65-NEXT:    }
-; V65-NEXT:    {
-; V65-NEXT:     v2.uw = vlsr(v2.uw,r2)
+; V65-NEXT:     v2 = vxor(v2,v2)
 ; V65-NEXT:    }
 ; V65-NEXT:    {
-; V65-NEXT:     v1 = vdelta(v1,v4)
+; V65-NEXT:     v5:4 = vmpye(v0.w,v1.uh)
 ; V65-NEXT:    }
 ; V65-NEXT:    {
-; V65-NEXT:     v1:0.uw = vmpy(v0.uh,v1.uh)
+; V65-NEXT:     q0 = vcmp.gt(v2.w,v0.w)
 ; V65-NEXT:    }
 ; V65-NEXT:    {
-; V65-NEXT:     v1:0.w = vadd(v0.uh,v1.uh)
+; V65-NEXT:     q1 = vcmp.gt(v2.w,v1.w)
 ; V65-NEXT:    }
 ; V65-NEXT:    {
-; V65-NEXT:     v0.w = vadd(v2.w,v0.w)
+; V65-NEXT:     v5:4 += vmpyo(v0.w,v1.h)
 ; V65-NEXT:    }
 ; V65-NEXT:    {
-; V65-NEXT:     v1.w = vadd(v3.w,v1.w)
+; V65-NEXT:     v31 = vand(q0,v1)
 ; V65-NEXT:    }
 ; V65-NEXT:    {
-; V65-NEXT:     v0.uw = vlsr(v0.uw,r2)
+; V65-NEXT:     if (q1) v31.w += v0.w
 ; V65-NEXT:    }
 ; V65-NEXT:    {
-; V65-NEXT:     v0.w = vadd(v0.w,v1.w)
+; V65-NEXT:     v0.w = vadd(v5.w,v31.w)
 ; V65-NEXT:    }
 ; V65-NEXT:    {
 ; V65-NEXT:     jumpr r31

diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll
new file mode 100644
index 0000000000000..e03bf942e44b3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll
@@ -0,0 +1,570 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon -mattr=+hvxv60,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V60 %s
+; RUN: llc -march=hexagon -mattr=+hvxv62,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V62 %s
+
+define <32 x i32> @f0(<32 x i32> %a0, <32 x i32> %a1) #0 {
+; V60-LABEL: f0:
+; V60:       // %bb.0:
+; V60-NEXT:    {
+; V60-NEXT:     r0 = #16
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.w = vmpye(v1.w,v0.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v0.w = vasr(v0.w,r0)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v3.w = vasr(v1.w,r0)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v5:4.w = vmpy(v0.h,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v31:30.w = vmpy(v0.h,v3.h)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v7:6.w = vadd(v2.uh,v4.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v29:28.w = vadd(v2.h,v4.h)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v29.w += vasr(v6.w,r0)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v0.w = vadd(v29.w,v30.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     jumpr r31
+; V60-NEXT:    }
+;
+; V62-LABEL: f0:
+; V62:       // %bb.0:
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 = vmpye(v0.w,v1.uh)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 += vmpyo(v0.w,v1.h)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v0 = v3
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     jumpr r31
+; V62-NEXT:    }
+  %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyss.parts.128B(<32 x i32> %a0, <32 x i32> %a1)
+  %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0
+  ret <32 x i32> %v1
+}
+
+define <32 x i32> @f1(<32 x i32> %a0, <32 x i32> %a1) #0 {
+; V60-LABEL: f1:
+; V60:       // %bb.0:
+; V60-NEXT:    {
+; V60-NEXT:     v2.w = vmpyieo(v0.h,v1.h)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.w += vmpyie(v0.w,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v0 = v2
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     jumpr r31
+; V60-NEXT:    }
+;
+; V62-LABEL: f1:
+; V62:       // %bb.0:
+; V62-NEXT:    {
+; V62-NEXT:     v2.w = vmpyieo(v0.h,v1.h)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v2.w += vmpyie(v0.w,v1.uh)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v0 = v2
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     jumpr r31
+; V62-NEXT:    }
+  %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyss.parts.128B(<32 x i32> %a0, <32 x i32> %a1)
+  %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1
+  ret <32 x i32> %v1
+}
+
+define <32 x i32> @f2(<32 x i32> %a0, <32 x i32> %a1) #0 {
+; V60-LABEL: f2:
+; V60:       // %bb.0:
+; V60-NEXT:    {
+; V60-NEXT:     r1:0 = combine(#16,##33686018)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v3:2.uw = vmpy(v0.uh,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v30 = vxor(v30,v30)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v4 = vsplat(r0)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     q0 = vcmp.gt(v30.w,v1.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.uw = vlsr(v2.uw,r1)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v4 = vdelta(v1,v4)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v5:4.uw = vmpy(v0.uh,v4.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v5:4.w = vadd(v5.uh,v4.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.w = vadd(v4.w,v2.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v5.w += vasr(v2.w,r1)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v31.w = vadd(v3.w,v5.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     if (q0) v31.w -= v0.w
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v0 = v31
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     jumpr r31
+; V60-NEXT:    }
+;
+; V62-LABEL: f2:
+; V62:       // %bb.0:
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 = vmpye(v0.w,v1.uh)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v4 = vxor(v4,v4)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     q0 = vcmp.gt(v4.w,v0.w)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 += vmpyo(v0.w,v1.h)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     if (q0) v3.w += v1.w
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v0 = v3
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     jumpr r31
+; V62-NEXT:    }
+  %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyus.parts.128B(<32 x i32> %a0, <32 x i32> %a1)
+  %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0
+  ret <32 x i32> %v1
+}
+
+define <32 x i32> @f3(<32 x i32> %a0, <32 x i32> %a1) #0 {
+; V60-LABEL: f3:
+; V60:       // %bb.0:
+; V60-NEXT:    {
+; V60-NEXT:     v2.w = vmpyieo(v0.h,v1.h)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.w += vmpyie(v0.w,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v0 = v2
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     jumpr r31
+; V60-NEXT:    }
+;
+; V62-LABEL: f3:
+; V62:       // %bb.0:
+; V62-NEXT:    {
+; V62-NEXT:     v2.w = vmpyieo(v0.h,v1.h)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v2.w += vmpyie(v0.w,v1.uh)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v0 = v2
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     jumpr r31
+; V62-NEXT:    }
+  %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyus.parts.128B(<32 x i32> %a0, <32 x i32> %a1)
+  %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1
+  ret <32 x i32> %v1
+}
+
+define <32 x i32> @f4(<32 x i32> %a0, <32 x i32> %a1) #0 {
+; V60-LABEL: f4:
+; V60:       // %bb.0:
+; V60-NEXT:    {
+; V60-NEXT:     r0 = ##33686018
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v3:2.uw = vmpy(v0.uh,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     r2 = #16
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v4 = vsplat(r0)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.uw = vlsr(v2.uw,r2)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1 = vdelta(v1,v4)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1:0.uw = vmpy(v0.uh,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1:0.w = vadd(v1.uh,v0.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v0.w = vadd(v0.w,v2.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1.w += vasr(v0.w,r2)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v0.w = vadd(v3.w,v1.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     jumpr r31
+; V60-NEXT:    }
+;
+; V62-LABEL: f4:
+; V62:       // %bb.0:
+; V62-NEXT:    {
+; V62-NEXT:     v2 = vxor(v2,v2)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v5:4 = vmpye(v0.w,v1.uh)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     q0 = vcmp.gt(v2.w,v0.w)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     q1 = vcmp.gt(v2.w,v1.w)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v5:4 += vmpyo(v0.w,v1.h)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v31 = vand(q0,v1)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     if (q1) v31.w += v0.w
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v0.w = vadd(v5.w,v31.w)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     jumpr r31
+; V62-NEXT:    }
+  %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyuu.parts.128B(<32 x i32> %a0, <32 x i32> %a1)
+  %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0
+  ret <32 x i32> %v1
+}
+
+define <32 x i32> @f5(<32 x i32> %a0, <32 x i32> %a1) #0 {
+; V60-LABEL: f5:
+; V60:       // %bb.0:
+; V60-NEXT:    {
+; V60-NEXT:     v2.w = vmpyieo(v0.h,v1.h)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.w += vmpyie(v0.w,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v0 = v2
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     jumpr r31
+; V60-NEXT:    }
+;
+; V62-LABEL: f5:
+; V62:       // %bb.0:
+; V62-NEXT:    {
+; V62-NEXT:     v2.w = vmpyieo(v0.h,v1.h)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v2.w += vmpyie(v0.w,v1.uh)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v0 = v2
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     jumpr r31
+; V62-NEXT:    }
+  %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyuu.parts.128B(<32 x i32> %a0, <32 x i32> %a1)
+  %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1
+  ret <32 x i32> %v1
+}
+
+define <64 x i32> @f10(<32 x i32> %a0, <32 x i32> %a1) #0 {
+; V60-LABEL: f10:
+; V60:       // %bb.0:
+; V60-NEXT:    {
+; V60-NEXT:     r0 = ##33686018
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v3:2.uw = vmpy(v0.uh,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     r2 = #16
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v4 = vxor(v4,v4)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v5 = vsplat(r0)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     q1 = vcmp.gt(v4.w,v0.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     q0 = vcmp.gt(v4.w,v1.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v6.uw = vlsr(v2.uw,r2)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v5 = vdelta(v1,v5)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1 = vmux(q1,v1,v4)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     if (q0) v1.w += v0.w
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v9:8.uw = vmpy(v0.uh,v5.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v9:8.w = vadd(v9.uh,v8.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v31.w = vadd(v8.w,v6.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.w += vasl(v8.w,r2)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v9.w += vasr(v31.w,r2)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v0.w = vadd(v3.w,v9.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v3.w = vsub(v0.w,v1.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1:0 = vcombine(v3,v2)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     jumpr r31
+; V60-NEXT:    }
+;
+; V62-LABEL: f10:
+; V62:       // %bb.0:
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 = vmpye(v0.w,v1.uh)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 += vmpyo(v0.w,v1.h)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v1:0 = vcombine(v3,v2)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     jumpr r31
+; V62-NEXT:    }
+  %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyss.parts.128B(<32 x i32> %a0, <32 x i32> %a1)
+  %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0
+  %v2 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1
+  %v3 = shufflevector <32 x i32> %v2, <32 x i32> %v1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  ret <64 x i32> %v3
+}
+
+define <64 x i32> @f11(<32 x i32> %a0, <32 x i32> %a1) #0 {
+; V60-LABEL: f11:
+; V60:       // %bb.0:
+; V60-NEXT:    {
+; V60-NEXT:     r1:0 = combine(#16,##33686018)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v3:2.uw = vmpy(v0.uh,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v31 = vxor(v31,v31)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v4 = vsplat(r0)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     q0 = vcmp.gt(v31.w,v1.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v5.uw = vlsr(v2.uw,r1)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v4 = vdelta(v1,v4)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v7:6.uw = vmpy(v0.uh,v4.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v7:6.w = vadd(v7.uh,v6.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v30.w = vadd(v6.w,v5.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.w += vasl(v6.w,r1)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v7.w += vasr(v30.w,r1)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v3.w = vadd(v3.w,v7.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     if (q0) v3.w -= v0.w
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1:0 = vcombine(v3,v2)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     jumpr r31
+; V60-NEXT:    }
+;
+; V62-LABEL: f11:
+; V62:       // %bb.0:
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 = vmpye(v0.w,v1.uh)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v4 = vxor(v4,v4)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     q0 = vcmp.gt(v4.w,v0.w)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 += vmpyo(v0.w,v1.h)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     if (q0) v3.w += v1.w
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v1:0 = vcombine(v3,v2)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     jumpr r31
+; V62-NEXT:    }
+  %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyus.parts.128B(<32 x i32> %a0, <32 x i32> %a1)
+  %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0
+  %v2 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1
+  %v3 = shufflevector <32 x i32> %v2, <32 x i32> %v1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  ret <64 x i32> %v3
+}
+
+define <64 x i32> @f12(<32 x i32> %a0, <32 x i32> %a1) #0 {
+; V60-LABEL: f12:
+; V60:       // %bb.0:
+; V60-NEXT:    {
+; V60-NEXT:     r1:0 = combine(#16,##33686018)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v3:2.uw = vmpy(v0.uh,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v4 = vsplat(r0)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v5.uw = vlsr(v2.uw,r1)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1 = vdelta(v1,v4)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1:0.uw = vmpy(v0.uh,v1.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1:0.w = vadd(v1.uh,v0.uh)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v31.w = vadd(v0.w,v5.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v2.w += vasl(v0.w,r1)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1.w += vasr(v31.w,r1)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v3.w = vadd(v3.w,v1.w)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     v1:0 = vcombine(v3,v2)
+; V60-NEXT:    }
+; V60-NEXT:    {
+; V60-NEXT:     jumpr r31
+; V60-NEXT:    }
+;
+; V62-LABEL: f12:
+; V62:       // %bb.0:
+; V62-NEXT:    {
+; V62-NEXT:     v4 = vxor(v4,v4)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 = vmpye(v0.w,v1.uh)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     q0 = vcmp.gt(v4.w,v0.w)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     q1 = vcmp.gt(v4.w,v1.w)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v3:2 += vmpyo(v0.w,v1.h)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v31 = vand(q0,v1)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     if (q1) v31.w += v0.w
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v3.w = vadd(v3.w,v31.w)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     v1:0 = vcombine(v3,v2)
+; V62-NEXT:    }
+; V62-NEXT:    {
+; V62-NEXT:     jumpr r31
+; V62-NEXT:    }
+  %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyuu.parts.128B(<32 x i32> %a0, <32 x i32> %a1)
+  %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0
+  %v2 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1
+  %v3 = shufflevector <32 x i32> %v2, <32 x i32> %v1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  ret <64 x i32> %v3
+}
+
+declare {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyss.parts.128B(<32 x i32>, <32 x i32>)
+declare {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyus.parts.128B(<32 x i32>, <32 x i32>)
+declare {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyuu.parts.128B(<32 x i32>, <32 x i32>)
+
+attributes #0 = { nounwind }
