[llvm] cdc7864 - [SystemZ] Optimize widening and high-word vector multiplication
Ulrich Weigand via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 15 10:29:21 PDT 2025
Author: Ulrich Weigand
Date: 2025-03-15T18:28:44+01:00
New Revision: cdc786498650c028c9c8ce23dfbff81bf2780d14
URL: https://github.com/llvm/llvm-project/commit/cdc786498650c028c9c8ce23dfbff81bf2780d14
DIFF: https://github.com/llvm/llvm-project/commit/cdc786498650c028c9c8ce23dfbff81bf2780d14.diff
LOG: [SystemZ] Optimize widening and high-word vector multiplication
Detect (non-intrinsic) IR patterns corresponding to the semantics
of the various widening and high-word multiplication instructions.
Specifically, this is done by:
- Recognizing even/odd widening multiplication patterns in DAGCombine
- Recognizing widening multiply-and-add on top during ISel
- Implementing the standard MULHS/MULHU IR opcodes
- Detecting high-word multiply-and-add (which common code does not)
Depending on architecture level, this can support all integer
vector types as well as the scalar i128 type.
Fixes: https://github.com/llvm/llvm-project/issues/129705
Added:
llvm/test/CodeGen/SystemZ/vec-mul-07.ll
llvm/test/CodeGen/SystemZ/vec-mul-08.ll
llvm/test/CodeGen/SystemZ/vec-mul-09.ll
llvm/test/CodeGen/SystemZ/vec-mul-10.ll
llvm/test/CodeGen/SystemZ/vec-mul-11.ll
llvm/test/CodeGen/SystemZ/vec-mul-12.ll
llvm/test/CodeGen/SystemZ/vec-mul-13.ll
llvm/test/CodeGen/SystemZ/vec-mul-14.ll
Modified:
llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
llvm/lib/Target/SystemZ/SystemZISelLowering.h
llvm/lib/Target/SystemZ/SystemZInstrVector.td
llvm/lib/Target/SystemZ/SystemZOperators.td
llvm/test/CodeGen/SystemZ/int-mul-16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index f87307030651a..d16d35a4ffb7b 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -454,8 +454,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal);
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
- if (VT != MVT::v2i64 || Subtarget.hasVectorEnhancements3())
+ if (VT != MVT::v2i64 || Subtarget.hasVectorEnhancements3()) {
setOperationAction(ISD::MUL, VT, Legal);
+ setOperationAction(ISD::MULHS, VT, Legal);
+ setOperationAction(ISD::MULHU, VT, Legal);
+ }
if (Subtarget.hasVectorEnhancements3() &&
VT != MVT::v16i8 && VT != MVT::v8i16) {
setOperationAction(ISD::SDIV, VT, Legal);
@@ -775,6 +778,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
ISD::STRICT_FP_EXTEND,
ISD::BSWAP,
ISD::SETCC,
+ ISD::SRL,
+ ISD::SRA,
+ ISD::MUL,
ISD::SDIV,
ISD::UDIV,
ISD::SREM,
@@ -5345,6 +5351,94 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::s390_vsbcbiq:
return DAG.getNode(SystemZISD::VSBCBI, SDLoc(Op), Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::s390_vmhb:
+ case Intrinsic::s390_vmhh:
+ case Intrinsic::s390_vmhf:
+ case Intrinsic::s390_vmhg:
+ case Intrinsic::s390_vmhq:
+ return DAG.getNode(ISD::MULHS, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::s390_vmlhb:
+ case Intrinsic::s390_vmlhh:
+ case Intrinsic::s390_vmlhf:
+ case Intrinsic::s390_vmlhg:
+ case Intrinsic::s390_vmlhq:
+ return DAG.getNode(ISD::MULHU, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::s390_vmahb:
+ case Intrinsic::s390_vmahh:
+ case Intrinsic::s390_vmahf:
+ case Intrinsic::s390_vmahg:
+ case Intrinsic::s390_vmahq:
+ return DAG.getNode(SystemZISD::VMAH, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::s390_vmalhb:
+ case Intrinsic::s390_vmalhh:
+ case Intrinsic::s390_vmalhf:
+ case Intrinsic::s390_vmalhg:
+ case Intrinsic::s390_vmalhq:
+ return DAG.getNode(SystemZISD::VMALH, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::s390_vmeb:
+ case Intrinsic::s390_vmeh:
+ case Intrinsic::s390_vmef:
+ case Intrinsic::s390_vmeg:
+ return DAG.getNode(SystemZISD::VME, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::s390_vmleb:
+ case Intrinsic::s390_vmleh:
+ case Intrinsic::s390_vmlef:
+ case Intrinsic::s390_vmleg:
+ return DAG.getNode(SystemZISD::VMLE, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::s390_vmob:
+ case Intrinsic::s390_vmoh:
+ case Intrinsic::s390_vmof:
+ case Intrinsic::s390_vmog:
+ return DAG.getNode(SystemZISD::VMO, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::s390_vmlob:
+ case Intrinsic::s390_vmloh:
+ case Intrinsic::s390_vmlof:
+ case Intrinsic::s390_vmlog:
+ return DAG.getNode(SystemZISD::VMLO, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::s390_vmaeb:
+ case Intrinsic::s390_vmaeh:
+ case Intrinsic::s390_vmaef:
+ case Intrinsic::s390_vmaeg:
+ return DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(),
+ DAG.getNode(SystemZISD::VME, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2)),
+ Op.getOperand(3));
+ case Intrinsic::s390_vmaleb:
+ case Intrinsic::s390_vmaleh:
+ case Intrinsic::s390_vmalef:
+ case Intrinsic::s390_vmaleg:
+ return DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(),
+ DAG.getNode(SystemZISD::VMLE, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2)),
+ Op.getOperand(3));
+ case Intrinsic::s390_vmaob:
+ case Intrinsic::s390_vmaoh:
+ case Intrinsic::s390_vmaof:
+ case Intrinsic::s390_vmaog:
+ return DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(),
+ DAG.getNode(SystemZISD::VMO, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2)),
+ Op.getOperand(3));
+ case Intrinsic::s390_vmalob:
+ case Intrinsic::s390_vmaloh:
+ case Intrinsic::s390_vmalof:
+ case Intrinsic::s390_vmalog:
+ return DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(),
+ DAG.getNode(SystemZISD::VMLO, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2)),
+ Op.getOperand(3));
}
return SDValue();
@@ -6912,6 +7006,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(VSBI);
OPCODE(VACCC);
OPCODE(VSBCBI);
+ OPCODE(VMAH);
+ OPCODE(VMALH);
+ OPCODE(VME);
+ OPCODE(VMLE);
+ OPCODE(VMO);
+ OPCODE(VMLO);
OPCODE(VICMPE);
OPCODE(VICMPH);
OPCODE(VICMPHL);
@@ -8311,6 +8411,200 @@ SDValue SystemZTargetLowering::combineIntDIVREM(
return SDValue();
}
+
+// Transform a right shift of a multiply-and-add into a multiply-and-add-high.
+// This is closely modeled after the common-code combineShiftToMULH.
+SDValue SystemZTargetLowering::combineShiftToMulAddHigh(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ "SRL or SRA node is required here!");
+
+ if (!Subtarget.hasVector())
+ return SDValue();
+
+ // Check the shift amount. Proceed with the transformation if the shift
+ // amount is constant.
+ ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1));
+ if (!ShiftAmtSrc)
+ return SDValue();
+
+ // The operation feeding into the shift must be an add.
+ SDValue ShiftOperand = N->getOperand(0);
+ if (ShiftOperand.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // One operand of the add must be a multiply.
+ SDValue MulOp = ShiftOperand.getOperand(0);
+ SDValue AddOp = ShiftOperand.getOperand(1);
+ if (MulOp.getOpcode() != ISD::MUL) {
+ if (AddOp.getOpcode() != ISD::MUL)
+ return SDValue();
+ std::swap(MulOp, AddOp);
+ }
+
+ // All operands must be equivalent extend nodes.
+ SDValue LeftOp = MulOp.getOperand(0);
+ SDValue RightOp = MulOp.getOperand(1);
+
+ bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
+ bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
+
+ if (!IsSignExt && !IsZeroExt)
+ return SDValue();
+
+ EVT NarrowVT = LeftOp.getOperand(0).getValueType();
+ unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
+
+ SDValue MulhRightOp;
+ if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
+ unsigned ActiveBits = IsSignExt
+ ? Constant->getAPIntValue().getSignificantBits()
+ : Constant->getAPIntValue().getActiveBits();
+ if (ActiveBits > NarrowVTSize)
+ return SDValue();
+ MulhRightOp = DAG.getConstant(
+ Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
+ NarrowVT);
+ } else {
+ if (LeftOp.getOpcode() != RightOp.getOpcode())
+ return SDValue();
+ // Check that the two extend nodes are the same type.
+ if (NarrowVT != RightOp.getOperand(0).getValueType())
+ return SDValue();
+ MulhRightOp = RightOp.getOperand(0);
+ }
+
+ SDValue MulhAddOp;
+ if (ConstantSDNode *Constant = isConstOrConstSplat(AddOp)) {
+ unsigned ActiveBits = IsSignExt
+ ? Constant->getAPIntValue().getSignificantBits()
+ : Constant->getAPIntValue().getActiveBits();
+ if (ActiveBits > NarrowVTSize)
+ return SDValue();
+ MulhAddOp = DAG.getConstant(
+ Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
+ NarrowVT);
+ } else {
+ if (LeftOp.getOpcode() != AddOp.getOpcode())
+ return SDValue();
+ // Check that the two extend nodes are the same type.
+ if (NarrowVT != AddOp.getOperand(0).getValueType())
+ return SDValue();
+ MulhAddOp = AddOp.getOperand(0);
+ }
+
+ EVT WideVT = LeftOp.getValueType();
+ // Proceed with the transformation if the wide types match.
+ assert((WideVT == RightOp.getValueType()) &&
+        "Cannot have a multiply node with two different operand types.");
+ assert((WideVT == AddOp.getValueType()) &&
+        "Cannot have an add node with two different operand types.");
+
+ // Proceed with the transformation if the wide type is twice as large
+ // as the narrow type.
+ if (WideVT.getScalarSizeInBits() != 2 * NarrowVTSize)
+ return SDValue();
+
+ // Check the shift amount with the narrow type size.
+ // Proceed with the transformation if the shift amount is the width
+ // of the narrow type.
+ unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();
+ if (ShiftAmt != NarrowVTSize)
+ return SDValue();
+
+ // Proceed if we support the multiply-and-add-high operation.
+ if (!(NarrowVT == MVT::v16i8 || NarrowVT == MVT::v8i16 ||
+ NarrowVT == MVT::v4i32 ||
+ (Subtarget.hasVectorEnhancements3() &&
+ (NarrowVT == MVT::v2i64 || NarrowVT == MVT::i128))))
+ return SDValue();
+
+ // Emit the VMAH (signed) or VMALH (unsigned) operation.
+ SDValue Result = DAG.getNode(IsSignExt ? SystemZISD::VMAH : SystemZISD::VMALH,
+ DL, NarrowVT, LeftOp.getOperand(0),
+ MulhRightOp, MulhAddOp);
+ bool IsSigned = N->getOpcode() == ISD::SRA;
+ return DAG.getExtOrTrunc(IsSigned, Result, DL, WideVT);
+}
+
+// Op is an operand of a multiplication. Check whether this can be folded
+// into an even/odd widening operation; if so, return the opcode to be used
+// and update Op to the appropriate sub-operand. Note that the caller must
+// verify that *both* operands of the multiplication support the operation.
+static unsigned detectEvenOddMultiplyOperand(const SelectionDAG &DAG,
+ const SystemZSubtarget &Subtarget,
+ SDValue &Op) {
+ EVT VT = Op.getValueType();
+
+ // Check for (sign/zero_extend_vector_inreg (vector_shuffle)) corresponding
+ // to selecting the even or odd vector elements.
+ if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
+ Op.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG)) {
+ bool IsSigned = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
+ unsigned NumElts = VT.getVectorNumElements();
+ Op = Op.getOperand(0);
+ if (Op.getValueType().getVectorNumElements() == 2 * NumElts &&
+ Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+ bool CanUseEven = true, CanUseOdd = true;
+ for (unsigned Elt = 0; Elt < NumElts; Elt++) {
+ if (ShuffleMask[Elt] == -1)
+ continue;
+ if (unsigned(ShuffleMask[Elt]) != 2 * Elt)
+ CanUseEven = false;
+ if (unsigned(ShuffleMask[Elt]) != 2 * Elt + 1)
+ CanUseOdd = false;
+ }
+ Op = Op.getOperand(0);
+ if (CanUseEven)
+ return IsSigned ? SystemZISD::VME : SystemZISD::VMLE;
+ if (CanUseOdd)
+ return IsSigned ? SystemZISD::VMO : SystemZISD::VMLO;
+ }
+ }
+
+ // For arch15, we can also support the v2i64->i128 case, which looks like
+ // (sign/zero_extend (extract_vector_elt X 0/1))
+ if (VT == MVT::i128 && Subtarget.hasVectorEnhancements3() &&
+ (Op.getOpcode() == ISD::SIGN_EXTEND ||
+ Op.getOpcode() == ISD::ZERO_EXTEND)) {
+ bool IsSigned = Op.getOpcode() == ISD::SIGN_EXTEND;
+ Op = Op.getOperand(0);
+ if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op.getOperand(0).getValueType() == MVT::v2i64 &&
+ Op.getOperand(1).getOpcode() == ISD::Constant) {
+ unsigned Elem = Op.getConstantOperandVal(1);
+ Op = Op.getOperand(0);
+ if (Elem == 0)
+ return IsSigned ? SystemZISD::VME : SystemZISD::VMLE;
+ if (Elem == 1)
+ return IsSigned ? SystemZISD::VMO : SystemZISD::VMLO;
+ }
+ }
+
+ return 0;
+}
+
+SDValue SystemZTargetLowering::combineMUL(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Detect even/odd widening multiplication.
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ unsigned OpcodeCand0 = detectEvenOddMultiplyOperand(DAG, Subtarget, Op0);
+ unsigned OpcodeCand1 = detectEvenOddMultiplyOperand(DAG, Subtarget, Op1);
+ if (OpcodeCand0 && OpcodeCand0 == OpcodeCand1)
+ return DAG.getNode(OpcodeCand0, SDLoc(N), N->getValueType(0), Op0, Op1);
+
+ return SDValue();
+}
+
SDValue SystemZTargetLowering::combineINTRINSIC(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -8370,6 +8664,9 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
case SystemZISD::BR_CCMASK: return combineBR_CCMASK(N, DCI);
case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
case SystemZISD::GET_CCMASK: return combineGET_CCMASK(N, DCI);
+ case ISD::SRL:
+ case ISD::SRA: return combineShiftToMulAddHigh(N, DCI);
+ case ISD::MUL: return combineMUL(N, DCI);
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 32cd9d5aa6733..acdb8cb4cb842 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -234,6 +234,11 @@ enum NodeType : unsigned {
// Compute carry/borrow indication for add/subtract with carry/borrow.
VACCC, VSBCBI,
+ // High-word multiply-and-add.
+ VMAH, VMALH,
+ // Widen and multiply even/odd vector elements.
+ VME, VMLE, VMO, VMLO,
+
// Compare integer vector operands 0 and 1 to produce the usual 0/-1
// vector result. VICMPE is for equality, VICMPH for "signed greater than"
// and VICMPHL for "unsigned greater than".
@@ -759,6 +764,8 @@ class SystemZTargetLowering : public TargetLowering {
SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineShiftToMulAddHigh(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineIntDIVREM(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineINTRINSIC(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 03588906159d7..3e78b3d175f4b 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -714,86 +714,86 @@ let Predicates = [FeatureVector] in {
let isCommutable = 1 in {
// Multiply and add low.
def VMAL : TernaryVRRdGeneric<"vmal", 0xE7AA>;
- def VMALB : TernaryVRRd<"vmalb", 0xE7AA, z_muladd, v128b, v128b, 0>;
- def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd, v128h, v128h, 1>;
- def VMALF : TernaryVRRd<"vmalf", 0xE7AA, z_muladd, v128f, v128f, 2>;
+ def VMALB : TernaryVRRd<"vmalb", 0xE7AA, z_muladd<mul>, v128b, v128b, 0>;
+ def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd<mul>, v128h, v128h, 1>;
+ def VMALF : TernaryVRRd<"vmalf", 0xE7AA, z_muladd<mul>, v128f, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in {
- def VMALG : TernaryVRRd<"vmalg", 0xE7AA, z_muladd, v128g, v128g, 3>;
- def VMALQ : TernaryVRRd<"vmalq", 0xE7AA, z_muladd, v128q, v128q, 4>;
+ def VMALG : TernaryVRRd<"vmalg", 0xE7AA, z_muladd<mul>, v128g, v128g, 3>;
+ def VMALQ : TernaryVRRd<"vmalq", 0xE7AA, z_muladd<mul>, v128q, v128q, 4>;
}
// Multiply and add high.
def VMAH : TernaryVRRdGeneric<"vmah", 0xE7AB>;
- def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, int_s390_vmahb, v128b, v128b, 0>;
- def VMAHH : TernaryVRRd<"vmahh", 0xE7AB, int_s390_vmahh, v128h, v128h, 1>;
- def VMAHF : TernaryVRRd<"vmahf", 0xE7AB, int_s390_vmahf, v128f, v128f, 2>;
+ def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, z_vmah, v128b, v128b, 0>;
+ def VMAHH : TernaryVRRd<"vmahh", 0xE7AB, z_vmah, v128h, v128h, 1>;
+ def VMAHF : TernaryVRRd<"vmahf", 0xE7AB, z_vmah, v128f, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in {
- def VMAHG : TernaryVRRd<"vmahg", 0xE7AB, int_s390_vmahg, v128g, v128g, 3>;
- def VMAHQ : TernaryVRRd<"vmahq", 0xE7AB, int_s390_vmahq, v128q, v128q, 4>;
+ def VMAHG : TernaryVRRd<"vmahg", 0xE7AB, z_vmah, v128g, v128g, 3>;
+ def VMAHQ : TernaryVRRd<"vmahq", 0xE7AB, z_vmah, v128q, v128q, 4>;
}
// Multiply and add logical high.
def VMALH : TernaryVRRdGeneric<"vmalh", 0xE7A9>;
- def VMALHB : TernaryVRRd<"vmalhb", 0xE7A9, int_s390_vmalhb, v128b, v128b, 0>;
- def VMALHH : TernaryVRRd<"vmalhh", 0xE7A9, int_s390_vmalhh, v128h, v128h, 1>;
- def VMALHF : TernaryVRRd<"vmalhf", 0xE7A9, int_s390_vmalhf, v128f, v128f, 2>;
+ def VMALHB : TernaryVRRd<"vmalhb", 0xE7A9, z_vmalh, v128b, v128b, 0>;
+ def VMALHH : TernaryVRRd<"vmalhh", 0xE7A9, z_vmalh, v128h, v128h, 1>;
+ def VMALHF : TernaryVRRd<"vmalhf", 0xE7A9, z_vmalh, v128f, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in {
- def VMALHG : TernaryVRRd<"vmalhg", 0xE7A9, int_s390_vmalhg, v128g, v128g, 3>;
- def VMALHQ : TernaryVRRd<"vmalhq", 0xE7A9, int_s390_vmalhq, v128q, v128q, 4>;
+ def VMALHG : TernaryVRRd<"vmalhg", 0xE7A9, z_vmalh, v128g, v128g, 3>;
+ def VMALHQ : TernaryVRRd<"vmalhq", 0xE7A9, z_vmalh, v128q, v128q, 4>;
}
// Multiply and add even.
def VMAE : TernaryVRRdGeneric<"vmae", 0xE7AE>;
- def VMAEB : TernaryVRRd<"vmaeb", 0xE7AE, int_s390_vmaeb, v128h, v128b, 0>;
- def VMAEH : TernaryVRRd<"vmaeh", 0xE7AE, int_s390_vmaeh, v128f, v128h, 1>;
- def VMAEF : TernaryVRRd<"vmaef", 0xE7AE, int_s390_vmaef, v128g, v128f, 2>;
+ def VMAEB : TernaryVRRd<"vmaeb", 0xE7AE, z_muladd<z_vme>, v128h, v128b, 0>;
+ def VMAEH : TernaryVRRd<"vmaeh", 0xE7AE, z_muladd<z_vme>, v128f, v128h, 1>;
+ def VMAEF : TernaryVRRd<"vmaef", 0xE7AE, z_muladd<z_vme>, v128g, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMAEG : TernaryVRRd<"vmaeg", 0xE7AE, int_s390_vmaeg, v128q, v128g, 3>;
+ def VMAEG : TernaryVRRd<"vmaeg", 0xE7AE, z_muladd<z_vme>, v128q, v128g, 3>;
// Multiply and add logical even.
def VMALE : TernaryVRRdGeneric<"vmale", 0xE7AC>;
- def VMALEB : TernaryVRRd<"vmaleb", 0xE7AC, int_s390_vmaleb, v128h, v128b, 0>;
- def VMALEH : TernaryVRRd<"vmaleh", 0xE7AC, int_s390_vmaleh, v128f, v128h, 1>;
- def VMALEF : TernaryVRRd<"vmalef", 0xE7AC, int_s390_vmalef, v128g, v128f, 2>;
+ def VMALEB : TernaryVRRd<"vmaleb", 0xE7AC, z_muladd<z_vmle>, v128h, v128b, 0>;
+ def VMALEH : TernaryVRRd<"vmaleh", 0xE7AC, z_muladd<z_vmle>, v128f, v128h, 1>;
+ def VMALEF : TernaryVRRd<"vmalef", 0xE7AC, z_muladd<z_vmle>, v128g, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMALEG : TernaryVRRd<"vmaleg", 0xE7AC, int_s390_vmaleg, v128q, v128g, 3>;
+ def VMALEG : TernaryVRRd<"vmaleg", 0xE7AC, z_muladd<z_vmle>, v128q, v128g, 3>;
// Multiply and add odd.
def VMAO : TernaryVRRdGeneric<"vmao", 0xE7AF>;
- def VMAOB : TernaryVRRd<"vmaob", 0xE7AF, int_s390_vmaob, v128h, v128b, 0>;
- def VMAOH : TernaryVRRd<"vmaoh", 0xE7AF, int_s390_vmaoh, v128f, v128h, 1>;
- def VMAOF : TernaryVRRd<"vmaof", 0xE7AF, int_s390_vmaof, v128g, v128f, 2>;
+ def VMAOB : TernaryVRRd<"vmaob", 0xE7AF, z_muladd<z_vmo>, v128h, v128b, 0>;
+ def VMAOH : TernaryVRRd<"vmaoh", 0xE7AF, z_muladd<z_vmo>, v128f, v128h, 1>;
+ def VMAOF : TernaryVRRd<"vmaof", 0xE7AF, z_muladd<z_vmo>, v128g, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMAOG : TernaryVRRd<"vmaog", 0xE7AF, int_s390_vmaog, v128q, v128g, 3>;
+ def VMAOG : TernaryVRRd<"vmaog", 0xE7AF, z_muladd<z_vmo>, v128q, v128g, 3>;
// Multiply and add logical odd.
def VMALO : TernaryVRRdGeneric<"vmalo", 0xE7AD>;
- def VMALOB : TernaryVRRd<"vmalob", 0xE7AD, int_s390_vmalob, v128h, v128b, 0>;
- def VMALOH : TernaryVRRd<"vmaloh", 0xE7AD, int_s390_vmaloh, v128f, v128h, 1>;
- def VMALOF : TernaryVRRd<"vmalof", 0xE7AD, int_s390_vmalof, v128g, v128f, 2>;
+ def VMALOB : TernaryVRRd<"vmalob", 0xE7AD, z_muladd<z_vmlo>, v128h, v128b, 0>;
+ def VMALOH : TernaryVRRd<"vmaloh", 0xE7AD, z_muladd<z_vmlo>, v128f, v128h, 1>;
+ def VMALOF : TernaryVRRd<"vmalof", 0xE7AD, z_muladd<z_vmlo>, v128g, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMALOG : TernaryVRRd<"vmalog", 0xE7AD, int_s390_vmalog, v128q, v128g, 3>;
+ def VMALOG : TernaryVRRd<"vmalog", 0xE7AD, z_muladd<z_vmlo>, v128q, v128g, 3>;
}
let isCommutable = 1 in {
// Multiply high.
def VMH : BinaryVRRcGeneric<"vmh", 0xE7A3>;
- def VMHB : BinaryVRRc<"vmhb", 0xE7A3, int_s390_vmhb, v128b, v128b, 0>;
- def VMHH : BinaryVRRc<"vmhh", 0xE7A3, int_s390_vmhh, v128h, v128h, 1>;
- def VMHF : BinaryVRRc<"vmhf", 0xE7A3, int_s390_vmhf, v128f, v128f, 2>;
+ def VMHB : BinaryVRRc<"vmhb", 0xE7A3, mulhs, v128b, v128b, 0>;
+ def VMHH : BinaryVRRc<"vmhh", 0xE7A3, mulhs, v128h, v128h, 1>;
+ def VMHF : BinaryVRRc<"vmhf", 0xE7A3, mulhs, v128f, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in {
- def VMHG : BinaryVRRc<"vmhg", 0xE7A3, int_s390_vmhg, v128g, v128g, 3>;
- def VMHQ : BinaryVRRc<"vmhq", 0xE7A3, int_s390_vmhq, v128q, v128q, 4>;
+ def VMHG : BinaryVRRc<"vmhg", 0xE7A3, mulhs, v128g, v128g, 3>;
+ def VMHQ : BinaryVRRc<"vmhq", 0xE7A3, mulhs, v128q, v128q, 4>;
}
// Multiply logical high.
def VMLH : BinaryVRRcGeneric<"vmlh", 0xE7A1>;
- def VMLHB : BinaryVRRc<"vmlhb", 0xE7A1, int_s390_vmlhb, v128b, v128b, 0>;
- def VMLHH : BinaryVRRc<"vmlhh", 0xE7A1, int_s390_vmlhh, v128h, v128h, 1>;
- def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, int_s390_vmlhf, v128f, v128f, 2>;
+ def VMLHB : BinaryVRRc<"vmlhb", 0xE7A1, mulhu, v128b, v128b, 0>;
+ def VMLHH : BinaryVRRc<"vmlhh", 0xE7A1, mulhu, v128h, v128h, 1>;
+ def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, mulhu, v128f, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in {
- def VMLHG : BinaryVRRc<"vmlhg", 0xE7A1, int_s390_vmlhg, v128g, v128g, 3>;
- def VMLHQ : BinaryVRRc<"vmlhq", 0xE7A1, int_s390_vmlhq, v128q, v128q, 4>;
+ def VMLHG : BinaryVRRc<"vmlhg", 0xE7A1, mulhu, v128g, v128g, 3>;
+ def VMLHQ : BinaryVRRc<"vmlhq", 0xE7A1, mulhu, v128q, v128q, 4>;
}
// Multiply low.
@@ -808,39 +808,35 @@ let Predicates = [FeatureVector] in {
// Multiply even.
def VME : BinaryVRRcGeneric<"vme", 0xE7A6>;
- def VMEB : BinaryVRRc<"vmeb", 0xE7A6, int_s390_vmeb, v128h, v128b, 0>;
- def VMEH : BinaryVRRc<"vmeh", 0xE7A6, int_s390_vmeh, v128f, v128h, 1>;
- def VMEF : BinaryVRRc<"vmef", 0xE7A6, int_s390_vmef, v128g, v128f, 2>;
+ def VMEB : BinaryVRRc<"vmeb", 0xE7A6, z_vme, v128h, v128b, 0>;
+ def VMEH : BinaryVRRc<"vmeh", 0xE7A6, z_vme, v128f, v128h, 1>;
+ def VMEF : BinaryVRRc<"vmef", 0xE7A6, z_vme, v128g, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMEG : BinaryVRRc<"vmeg", 0xE7A6, int_s390_vmeg, v128q, v128g, 3>;
+ def VMEG : BinaryVRRc<"vmeg", 0xE7A6, z_vme, v128q, v128g, 3>;
// Multiply logical even.
def VMLE : BinaryVRRcGeneric<"vmle", 0xE7A4>;
- def VMLEB : BinaryVRRc<"vmleb", 0xE7A4, int_s390_vmleb, v128h, v128b, 0>;
- def VMLEH : BinaryVRRc<"vmleh", 0xE7A4, int_s390_vmleh, v128f, v128h, 1>;
- def VMLEF : BinaryVRRc<"vmlef", 0xE7A4, int_s390_vmlef, v128g, v128f, 2>;
+ def VMLEB : BinaryVRRc<"vmleb", 0xE7A4, z_vmle, v128h, v128b, 0>;
+ def VMLEH : BinaryVRRc<"vmleh", 0xE7A4, z_vmle, v128f, v128h, 1>;
+ def VMLEF : BinaryVRRc<"vmlef", 0xE7A4, z_vmle, v128g, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMLEG : BinaryVRRc<"vmleg", 0xE7A4, int_s390_vmleg, v128q, v128g, 3>;
+ def VMLEG : BinaryVRRc<"vmleg", 0xE7A4, z_vmle, v128q, v128g, 3>;
// Multiply odd.
def VMO : BinaryVRRcGeneric<"vmo", 0xE7A7>;
- def VMOB : BinaryVRRc<"vmob", 0xE7A7, int_s390_vmob, v128h, v128b, 0>;
- def VMOH : BinaryVRRc<"vmoh", 0xE7A7, int_s390_vmoh, v128f, v128h, 1>;
- def VMOF : BinaryVRRc<"vmof", 0xE7A7, int_s390_vmof, v128g, v128f, 2>;
+ def VMOB : BinaryVRRc<"vmob", 0xE7A7, z_vmo, v128h, v128b, 0>;
+ def VMOH : BinaryVRRc<"vmoh", 0xE7A7, z_vmo, v128f, v128h, 1>;
+ def VMOF : BinaryVRRc<"vmof", 0xE7A7, z_vmo, v128g, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMOG : BinaryVRRc<"vmog", 0xE7A7, int_s390_vmog, v128q, v128g, 3>;
+ def VMOG : BinaryVRRc<"vmog", 0xE7A7, z_vmo, v128q, v128g, 3>;
// Multiply logical odd.
def VMLO : BinaryVRRcGeneric<"vmlo", 0xE7A5>;
- def VMLOB : BinaryVRRc<"vmlob", 0xE7A5, int_s390_vmlob, v128h, v128b, 0>;
- def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>;
- def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>;
+ def VMLOB : BinaryVRRc<"vmlob", 0xE7A5, z_vmlo, v128h, v128b, 0>;
+ def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, z_vmlo, v128f, v128h, 1>;
+ def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, z_vmlo, v128g, v128f, 2>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMLOG : BinaryVRRc<"vmlog", 0xE7A5, int_s390_vmlog, v128q, v128g, 3>;
- }
- let Predicates = [FeatureVectorEnhancements3] in {
- def : Pat<(i128 (mulhs VR128:$x, VR128:$y)), (VMHQ VR128:$x, VR128:$y)>;
- def : Pat<(i128 (mulhu VR128:$x, VR128:$y)), (VMLHQ VR128:$x, VR128:$y)>;
+ def VMLOG : BinaryVRRc<"vmlog", 0xE7A5, z_vmlo, v128q, v128g, 3>;
}
// Multiply sum logical.
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 39670adaa257e..1cc153b79e289 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -315,6 +315,12 @@ def z_vaccc : SDNode<"SystemZISD::VACCC", SDT_ZTernary>;
def z_vscbi : SDNode<"SystemZISD::VSCBI", SDTIntBinOp>;
def z_vsbi : SDNode<"SystemZISD::VSBI", SDT_ZTernary>;
def z_vsbcbi : SDNode<"SystemZISD::VSBCBI", SDT_ZTernary>;
+def z_vmah : SDNode<"SystemZISD::VMAH", SDT_ZTernary>;
+def z_vmalh : SDNode<"SystemZISD::VMALH", SDT_ZTernary>;
+def z_vme : SDNode<"SystemZISD::VME", SDT_ZBinaryConv>;
+def z_vmle : SDNode<"SystemZISD::VMLE", SDT_ZBinaryConv>;
+def z_vmo : SDNode<"SystemZISD::VMO", SDT_ZBinaryConv>;
+def z_vmlo : SDNode<"SystemZISD::VMLO", SDT_ZBinaryConv>;
def z_loadbswap : SDNode<"SystemZISD::LRV", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -814,8 +820,9 @@ def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
def z_inegabs : PatFrag<(ops node:$src), (ineg (abs node:$src))>;
// Integer multiply-and-add
-def z_muladd : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (add (mul node:$src1, node:$src2), node:$src3)>;
+class z_muladd<SDPatternOperator mulop>
+ : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (add (mulop node:$src1, node:$src2), node:$src3)>;
// Alternatives to match operations with or without an overflow CC result.
def z_sadd : PatFrags<(ops node:$src1, node:$src2),
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-16.ll b/llvm/test/CodeGen/SystemZ/int-mul-16.ll
index d84ca93e3b12c..e3153fad00235 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-16.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-16.ll
@@ -37,22 +37,123 @@ define i128 @f2(i128 %a, i128 %b) {
ret i128 %res
}
-;; ; Multiply-and-add high signed.
-;; define i128 @f3(i128 %a, i128 %b, i128 %add) {
-;; ; CHECX-LABEL: f3:
-;; ; CHECX: # %bb.0:
-;; ; CHECX-NEXT: vl %v0, 0(%r3), 3
-;; ; CHECX-NEXT: vl %v1, 0(%r4), 3
-;; ; CHECX-NEXT: vl %v2, 0(%r5), 3
-;; ; CHECX-NEXT: vmhq %v0, %v0, %v1, %v2
-;; ; CHECX-NEXT: vst %v0, 0(%r2), 3
-;; ; CHECX-NEXT: br %r14
-;; %exta = sext i128 %a to i256
-;; %extb = sext i128 %b to i256
-;; %extadd = sext i128 %add to i256
-;; %extmul = mul i256 %exta, %extb
-;; %extres = add i256 %extmul, %extadd
-;; %shiftres = lshr i256 %extres, 128
-;; %res = trunc i256 %shiftres to i128
-;; ret i128 %res
-;; }
+; Multiply-and-add high signed.
+define i128 @f3(i128 %a, i128 %b, i128 %add) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r5), 3
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vmahq %v0, %v2, %v1, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %exta = sext i128 %a to i256
+ %extb = sext i128 %b to i256
+ %extadd = sext i128 %add to i256
+ %extmul = mul i256 %exta, %extb
+ %extres = add i256 %extmul, %extadd
+ %shiftres = lshr i256 %extres, 128
+ %res = trunc i256 %shiftres to i128
+ ret i128 %res
+}
+
+; Multiply-and-add high unsigned.
+define i128 @f4(i128 %a, i128 %b, i128 %add) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r5), 3
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vmalhq %v0, %v2, %v1, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %exta = zext i128 %a to i256
+ %extb = zext i128 %b to i256
+ %extadd = zext i128 %add to i256
+ %extmul = mul i256 %exta, %extb
+ %extres = add i256 %extmul, %extadd
+ %shiftres = lshr i256 %extres, 128
+ %res = trunc i256 %shiftres to i128
+ ret i128 %res
+}
+
+; Multiply-and-add high signed with immediate operand to multiply.
+define i128 @f5(i128 %a, i128 %add) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v2, 0(%r1), 3
+; CHECK-NEXT: vmahq %v0, %v1, %v2, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %exta = sext i128 %a to i256
+ %extadd = sext i128 %add to i256
+ %extmul = mul i256 %exta, 12345
+ %extres = add i256 %extmul, %extadd
+ %shiftres = lshr i256 %extres, 128
+ %res = trunc i256 %shiftres to i128
+ ret i128 %res
+}
+
+; Multiply-and-add high unsigned with immediate operand to multiply.
+define i128 @f6(i128 %a, i128 %add) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI5_0
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v2, 0(%r1), 3
+; CHECK-NEXT: vmalhq %v0, %v1, %v2, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %exta = zext i128 %a to i256
+ %extadd = zext i128 %add to i256
+ %extmul = mul i256 %exta, 12345
+ %extres = add i256 %extmul, %extadd
+ %shiftres = lshr i256 %extres, 128
+ %res = trunc i256 %shiftres to i128
+ ret i128 %res
+}
+
+; Multiply-and-add high signed with immediate operand to addition.
+define i128 @f7(i128 %a, i128 %b) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI6_0
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v2, 0(%r1), 3
+; CHECK-NEXT: vmahq %v0, %v1, %v0, %v2
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %exta = sext i128 %a to i256
+ %extb = sext i128 %b to i256
+ %extmul = mul i256 %exta, %extb
+ %extres = add i256 %extmul, 12345
+ %shiftres = lshr i256 %extres, 128
+ %res = trunc i256 %shiftres to i128
+ ret i128 %res
+}
+
+; Multiply-and-add high unsigned with immediate operand to addition.
+define i128 @f8(i128 %a, i128 %b) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI7_0
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v2, 0(%r1), 3
+; CHECK-NEXT: vmalhq %v0, %v1, %v0, %v2
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %exta = zext i128 %a to i256
+ %extb = zext i128 %b to i256
+ %extmul = mul i256 %exta, %extb
+ %extres = add i256 %extmul, 12345
+ %shiftres = lshr i256 %extres, 128
+ %res = trunc i256 %shiftres to i128
+ ret i128 %res
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-07.ll b/llvm/test/CodeGen/SystemZ/vec-mul-07.ll
new file mode 100644
index 0000000000000..73c7a8dec5dfc
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-07.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test widening vector multiplication.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 (even) -> v8i16 unsigned widening multiplication.
+define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmleb %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %zext1 = zext <8 x i8> %shuf1 to <8 x i16>
+ %shuf2 = shufflevector <16 x i8> %val2, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %zext2 = zext <8 x i8> %shuf2 to <8 x i16>
+ %ret = mul <8 x i16> %zext1, %zext2
+ ret <8 x i16> %ret
+}
+
+; Test a v16i8 (odd) -> v8i16 unsigned widening multiplication.
+define <8 x i16> @f2(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmlob %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %zext1 = zext <8 x i8> %shuf1 to <8 x i16>
+ %shuf2 = shufflevector <16 x i8> %val2, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %zext2 = zext <8 x i8> %shuf2 to <8 x i16>
+ %ret = mul <8 x i16> %zext1, %zext2
+ ret <8 x i16> %ret
+}
+
+; Test a v16i8 (even) -> v8i16 signed widening multiplication.
+define <8 x i16> @f3(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmeb %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %sext1 = sext <8 x i8> %shuf1 to <8 x i16>
+ %shuf2 = shufflevector <16 x i8> %val2, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %sext2 = sext <8 x i8> %shuf2 to <8 x i16>
+ %ret = mul <8 x i16> %sext1, %sext2
+ ret <8 x i16> %ret
+}
+
+; Test a v16i8 (odd) -> v8i16 signed widening multiplication.
+define <8 x i16> @f4(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmob %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %sext1 = sext <8 x i8> %shuf1 to <8 x i16>
+ %shuf2 = shufflevector <16 x i8> %val2, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %sext2 = sext <8 x i8> %shuf2 to <8 x i16>
+ %ret = mul <8 x i16> %sext1, %sext2
+ ret <8 x i16> %ret
+}
+
+; Test a v8i16 (even) -> v4i32 unsigned widening multiplication.
+define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmleh %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %zext1 = zext <4 x i16> %shuf1 to <4 x i32>
+ %shuf2 = shufflevector <8 x i16> %val2, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %zext2 = zext <4 x i16> %shuf2 to <4 x i32>
+ %ret = mul <4 x i32> %zext1, %zext2
+ ret <4 x i32> %ret
+}
+
+; Test a v8i16 (odd) -> v4i32 unsigned widening multiplication.
+define <4 x i32> @f6(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmloh %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %zext1 = zext <4 x i16> %shuf1 to <4 x i32>
+ %shuf2 = shufflevector <8 x i16> %val2, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %zext2 = zext <4 x i16> %shuf2 to <4 x i32>
+ %ret = mul <4 x i32> %zext1, %zext2
+ ret <4 x i32> %ret
+}
+
+; Test a v8i16 (even) -> v4i32 signed widening multiplication.
+define <4 x i32> @f7(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmeh %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %sext1 = sext <4 x i16> %shuf1 to <4 x i32>
+ %shuf2 = shufflevector <8 x i16> %val2, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %sext2 = sext <4 x i16> %shuf2 to <4 x i32>
+ %ret = mul <4 x i32> %sext1, %sext2
+ ret <4 x i32> %ret
+}
+
+; Test a v8i16 (odd) -> v4i32 signed widening multiplication.
+define <4 x i32> @f8(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmoh %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %sext1 = sext <4 x i16> %shuf1 to <4 x i32>
+ %shuf2 = shufflevector <8 x i16> %val2, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %sext2 = sext <4 x i16> %shuf2 to <4 x i32>
+ %ret = mul <4 x i32> %sext1, %sext2
+ ret <4 x i32> %ret
+}
+
+; Test a v4i32 (even) -> v2i64 unsigned widening multiplication.
+define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmlef %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+ %zext1 = zext <2 x i32> %shuf1 to <2 x i64>
+ %shuf2 = shufflevector <4 x i32> %val2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+ %zext2 = zext <2 x i32> %shuf2 to <2 x i64>
+ %ret = mul <2 x i64> %zext1, %zext2
+ ret <2 x i64> %ret
+}
+
+; Test a v4i32 (odd) -> v2i64 unsigned widening multiplication.
+define <2 x i64> @f10(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmlof %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+ %zext1 = zext <2 x i32> %shuf1 to <2 x i64>
+ %shuf2 = shufflevector <4 x i32> %val2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+ %zext2 = zext <2 x i32> %shuf2 to <2 x i64>
+ %ret = mul <2 x i64> %zext1, %zext2
+ ret <2 x i64> %ret
+}
+
+; Test a v4i32 (even) -> v2i64 signed widening multiplication.
+define <2 x i64> @f11(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f11:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmef %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+ %sext1 = sext <2 x i32> %shuf1 to <2 x i64>
+ %shuf2 = shufflevector <4 x i32> %val2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+ %sext2 = sext <2 x i32> %shuf2 to <2 x i64>
+ %ret = mul <2 x i64> %sext1, %sext2
+ ret <2 x i64> %ret
+}
+
+; Test a v4i32 (odd) -> v2i64 signed widening multiplication.
+define <2 x i64> @f12(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmof %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+ %sext1 = sext <2 x i32> %shuf1 to <2 x i64>
+ %shuf2 = shufflevector <4 x i32> %val2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+ %sext2 = sext <2 x i32> %shuf2 to <2 x i64>
+ %ret = mul <2 x i64> %sext1, %sext2
+ ret <2 x i64> %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-08.ll b/llvm/test/CodeGen/SystemZ/vec-mul-08.ll
new file mode 100644
index 0000000000000..5c5640428ed1e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-08.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test widening vector multiplication on arch15.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s
+
+; Test a v2i64 (even) -> i128 unsigned widening multiplication.
+define i128 @f1(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmleg %v0, %v24, %v26
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %elt1 = extractelement <2 x i64> %val1, i32 0
+ %zext1 = zext i64 %elt1 to i128
+ %elt2 = extractelement <2 x i64> %val2, i32 0
+ %zext2 = zext i64 %elt2 to i128
+ %ret = mul i128 %zext1, %zext2
+ ret i128 %ret
+}
+
+; Test a v2i64 (odd) -> i128 unsigned widening multiplication.
+define i128 @f2(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmlog %v0, %v24, %v26
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %elt1 = extractelement <2 x i64> %val1, i32 1
+ %zext1 = zext i64 %elt1 to i128
+ %elt2 = extractelement <2 x i64> %val2, i32 1
+ %zext2 = zext i64 %elt2 to i128
+ %ret = mul i128 %zext1, %zext2
+ ret i128 %ret
+}
+
+; Test a v2i64 (even) -> i128 signed widening multiplication.
+define i128 @f3(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmeg %v0, %v24, %v26
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %elt1 = extractelement <2 x i64> %val1, i32 0
+ %sext1 = sext i64 %elt1 to i128
+ %elt2 = extractelement <2 x i64> %val2, i32 0
+ %sext2 = sext i64 %elt2 to i128
+ %ret = mul i128 %sext1, %sext2
+ ret i128 %ret
+}
+
+; Test a v2i64 (odd) -> i128 signed widening multiplication.
+define i128 @f4(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmog %v0, %v24, %v26
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %elt1 = extractelement <2 x i64> %val1, i32 1
+ %sext1 = sext i64 %elt1 to i128
+ %elt2 = extractelement <2 x i64> %val2, i32 1
+ %sext2 = sext i64 %elt2 to i128
+ %ret = mul i128 %sext1, %sext2
+ ret i128 %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-09.ll b/llvm/test/CodeGen/SystemZ/vec-mul-09.ll
new file mode 100644
index 0000000000000..def57ca03bb0c
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-09.ll
@@ -0,0 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test widening vector multiply-and-add.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 (even) -> v8i16 unsigned widening multiply-and-add.
+define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaleb %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %zext1 = zext <8 x i8> %shuf1 to <8 x i16>
+ %shuf2 = shufflevector <16 x i8> %val2, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %zext2 = zext <8 x i8> %shuf2 to <8 x i16>
+ %mul = mul <8 x i16> %zext1, %zext2
+ %ret = add <8 x i16> %mul, %val3
+ ret <8 x i16> %ret
+}
+
+; Test a v16i8 (odd) -> v8i16 unsigned widening multiply-and-add.
+define <8 x i16> @f2(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmalob %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %zext1 = zext <8 x i8> %shuf1 to <8 x i16>
+ %shuf2 = shufflevector <16 x i8> %val2, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %zext2 = zext <8 x i8> %shuf2 to <8 x i16>
+ %mul = mul <8 x i16> %zext1, %zext2
+ %ret = add <8 x i16> %mul, %val3
+ ret <8 x i16> %ret
+}
+
+; Test a v16i8 (even) -> v8i16 signed widening multiply-and-add.
+define <8 x i16> @f3(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaeb %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %sext1 = sext <8 x i8> %shuf1 to <8 x i16>
+ %shuf2 = shufflevector <16 x i8> %val2, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %sext2 = sext <8 x i8> %shuf2 to <8 x i16>
+ %mul = mul <8 x i16> %sext1, %sext2
+ %ret = add <8 x i16> %mul, %val3
+ ret <8 x i16> %ret
+}
+
+; Test a v16i8 (odd) -> v8i16 signed widening multiply-and-add.
+define <8 x i16> @f4(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaob %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %sext1 = sext <8 x i8> %shuf1 to <8 x i16>
+ %shuf2 = shufflevector <16 x i8> %val2, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %sext2 = sext <8 x i8> %shuf2 to <8 x i16>
+ %mul = mul <8 x i16> %sext1, %sext2
+ %ret = add <8 x i16> %mul, %val3
+ ret <8 x i16> %ret
+}
+
+; Test a v8i16 (even) -> v4i32 unsigned widening multiply-and-add.
+define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaleh %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %zext1 = zext <4 x i16> %shuf1 to <4 x i32>
+ %shuf2 = shufflevector <8 x i16> %val2, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %zext2 = zext <4 x i16> %shuf2 to <4 x i32>
+ %mul = mul <4 x i32> %zext1, %zext2
+ %ret = add <4 x i32> %mul, %val3
+ ret <4 x i32> %ret
+}
+
+; Test a v8i16 (odd) -> v4i32 unsigned widening multiply-and-add.
+define <4 x i32> @f6(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaloh %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %zext1 = zext <4 x i16> %shuf1 to <4 x i32>
+ %shuf2 = shufflevector <8 x i16> %val2, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %zext2 = zext <4 x i16> %shuf2 to <4 x i32>
+ %mul = mul <4 x i32> %zext1, %zext2
+ %ret = add <4 x i32> %mul, %val3
+ ret <4 x i32> %ret
+}
+
+; Test a v8i16 (even) -> v4i32 signed widening multiply-and-add.
+define <4 x i32> @f7(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaeh %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %sext1 = sext <4 x i16> %shuf1 to <4 x i32>
+ %shuf2 = shufflevector <8 x i16> %val2, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %sext2 = sext <4 x i16> %shuf2 to <4 x i32>
+ %mul = mul <4 x i32> %sext1, %sext2
+ %ret = add <4 x i32> %mul, %val3
+ ret <4 x i32> %ret
+}
+
+; Test a v8i16 (odd) -> v4i32 signed widening multiply-and-add.
+define <4 x i32> @f8(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaoh %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %sext1 = sext <4 x i16> %shuf1 to <4 x i32>
+ %shuf2 = shufflevector <8 x i16> %val2, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %sext2 = sext <4 x i16> %shuf2 to <4 x i32>
+ %mul = mul <4 x i32> %sext1, %sext2
+ %ret = add <4 x i32> %mul, %val3
+ ret <4 x i32> %ret
+}
+
+; Test a v4i32 (even) -> v2i64 unsigned widening multiply-and-add.
+define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmalef %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+ %zext1 = zext <2 x i32> %shuf1 to <2 x i64>
+ %shuf2 = shufflevector <4 x i32> %val2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+ %zext2 = zext <2 x i32> %shuf2 to <2 x i64>
+ %mul = mul <2 x i64> %zext1, %zext2
+ %ret = add <2 x i64> %mul, %val3
+ ret <2 x i64> %ret
+}
+
+; Test a v4i32 (odd) -> v2i64 unsigned widening multiply-and-add.
+define <2 x i64> @f10(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmalof %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+ %zext1 = zext <2 x i32> %shuf1 to <2 x i64>
+ %shuf2 = shufflevector <4 x i32> %val2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+ %zext2 = zext <2 x i32> %shuf2 to <2 x i64>
+ %mul = mul <2 x i64> %zext1, %zext2
+ %ret = add <2 x i64> %mul, %val3
+ ret <2 x i64> %ret
+}
+
+; Test a v4i32 (even) -> v2i64 signed widening multiply-and-add.
+define <2 x i64> @f11(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
+; CHECK-LABEL: f11:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaef %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+ %sext1 = sext <2 x i32> %shuf1 to <2 x i64>
+ %shuf2 = shufflevector <4 x i32> %val2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+ %sext2 = sext <2 x i32> %shuf2 to <2 x i64>
+ %mul = mul <2 x i64> %sext1, %sext2
+ %ret = add <2 x i64> %mul, %val3
+ ret <2 x i64> %ret
+}
+
+; Test a v4i32 (odd) -> v2i64 signed widening multiply-and-add.
+define <2 x i64> @f12(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
+; CHECK-LABEL: f12:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaof %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+ %sext1 = sext <2 x i32> %shuf1 to <2 x i64>
+ %shuf2 = shufflevector <4 x i32> %val2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+ %sext2 = sext <2 x i32> %shuf2 to <2 x i64>
+ %mul = mul <2 x i64> %sext1, %sext2
+ %ret = add <2 x i64> %mul, %val3
+ ret <2 x i64> %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-10.ll b/llvm/test/CodeGen/SystemZ/vec-mul-10.ll
new file mode 100644
index 0000000000000..74d80ff25dd0b
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-10.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test widening vector multiply-and-add on arch15.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s
+
+; Test a v2i64 (even) -> i128 unsigned widening multiply-and-add.
+define i128 @f1(<2 x i64> %val1, <2 x i64> %val2, i128 %val3) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vmaleg %v0, %v24, %v26, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %elt1 = extractelement <2 x i64> %val1, i32 0
+ %zext1 = zext i64 %elt1 to i128
+ %elt2 = extractelement <2 x i64> %val2, i32 0
+ %zext2 = zext i64 %elt2 to i128
+ %mul = mul i128 %zext1, %zext2
+ %ret = add i128 %mul, %val3
+ ret i128 %ret
+}
+
+; Test a v2i64 (odd) -> i128 unsigned widening multiply-and-add.
+define i128 @f2(<2 x i64> %val1, <2 x i64> %val2, i128 %val3) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vmalog %v0, %v24, %v26, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %elt1 = extractelement <2 x i64> %val1, i32 1
+ %zext1 = zext i64 %elt1 to i128
+ %elt2 = extractelement <2 x i64> %val2, i32 1
+ %zext2 = zext i64 %elt2 to i128
+ %mul = mul i128 %zext1, %zext2
+ %ret = add i128 %mul, %val3
+ ret i128 %ret
+}
+
+; Test a v2i64 (even) -> i128 signed widening multiply-and-add.
+define i128 @f3(<2 x i64> %val1, <2 x i64> %val2, i128 %val3) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vmaeg %v0, %v24, %v26, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %elt1 = extractelement <2 x i64> %val1, i32 0
+ %sext1 = sext i64 %elt1 to i128
+ %elt2 = extractelement <2 x i64> %val2, i32 0
+ %sext2 = sext i64 %elt2 to i128
+ %mul = mul i128 %sext1, %sext2
+ %ret = add i128 %mul, %val3
+ ret i128 %ret
+}
+
+; Test a v2i64 (odd) -> i128 signed widening multiply-and-add.
+define i128 @f4(<2 x i64> %val1, <2 x i64> %val2, i128 %val3) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vmaog %v0, %v24, %v26, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %elt1 = extractelement <2 x i64> %val1, i32 1
+ %sext1 = sext i64 %elt1 to i128
+ %elt2 = extractelement <2 x i64> %val2, i32 1
+ %sext2 = sext i64 %elt2 to i128
+ %mul = mul i128 %sext1, %sext2
+ %ret = add i128 %mul, %val3
+ ret i128 %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-11.ll b/llvm/test/CodeGen/SystemZ/vec-mul-11.ll
new file mode 100644
index 0000000000000..77e098c7f2eeb
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-11.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test high-part vector multiplication.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 unsigned high-part multiplication.
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmlhb %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %zext1 = zext <16 x i8> %val1 to <16 x i16>
+ %zext2 = zext <16 x i8> %val2 to <16 x i16>
+ %mulx = mul <16 x i16> %zext1, %zext2
+ %highx = lshr <16 x i16> %mulx, splat(i16 8)
+ %high = trunc <16 x i16> %highx to <16 x i8>
+ ret <16 x i8> %high
+}
+
+; Test a v16i8 signed high-part multiplication.
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmhb %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %sext1 = sext <16 x i8> %val1 to <16 x i16>
+ %sext2 = sext <16 x i8> %val2 to <16 x i16>
+ %mulx = mul <16 x i16> %sext1, %sext2
+ %highx = lshr <16 x i16> %mulx, splat(i16 8)
+ %high = trunc <16 x i16> %highx to <16 x i8>
+ ret <16 x i8> %high
+}
+
+; Test a v8i16 unsigned high-part multiplication.
+define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmlhh %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %zext1 = zext <8 x i16> %val1 to <8 x i32>
+ %zext2 = zext <8 x i16> %val2 to <8 x i32>
+ %mulx = mul <8 x i32> %zext1, %zext2
+ %highx = lshr <8 x i32> %mulx, splat(i32 16)
+ %high = trunc <8 x i32> %highx to <8 x i16>
+ ret <8 x i16> %high
+}
+
+; Test a v8i16 signed high-part multiplication.
+define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmhh %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %sext1 = sext <8 x i16> %val1 to <8 x i32>
+ %sext2 = sext <8 x i16> %val2 to <8 x i32>
+ %mulx = mul <8 x i32> %sext1, %sext2
+ %highx = lshr <8 x i32> %mulx, splat(i32 16)
+ %high = trunc <8 x i32> %highx to <8 x i16>
+ ret <8 x i16> %high
+}
+
+; Test a v4i32 unsigned high-part multiplication.
+define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmlhf %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %zext1 = zext <4 x i32> %val1 to <4 x i64>
+ %zext2 = zext <4 x i32> %val2 to <4 x i64>
+ %mulx = mul <4 x i64> %zext1, %zext2
+ %highx = lshr <4 x i64> %mulx, splat(i64 32)
+ %high = trunc <4 x i64> %highx to <4 x i32>
+ ret <4 x i32> %high
+}
+
+; Test a v4i32 signed high-part multiplication.
+define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmhf %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %sext1 = sext <4 x i32> %val1 to <4 x i64>
+ %sext2 = sext <4 x i32> %val2 to <4 x i64>
+ %mulx = mul <4 x i64> %sext1, %sext2
+ %highx = lshr <4 x i64> %mulx, splat(i64 32)
+ %high = trunc <4 x i64> %highx to <4 x i32>
+ ret <4 x i32> %high
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-12.ll b/llvm/test/CodeGen/SystemZ/vec-mul-12.ll
new file mode 100644
index 0000000000000..ee22ad832492c
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-12.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test high-part vector multiplication on arch15
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s
+
+; Test a v2i64 unsigned high-part multiplication.
+define <2 x i64> @f1(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmlhg %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %zext1 = zext <2 x i64> %val1 to <2 x i128>
+ %zext2 = zext <2 x i64> %val2 to <2 x i128>
+ %mulx = mul <2 x i128> %zext1, %zext2
+ %highx = lshr <2 x i128> %mulx, splat(i128 64)
+ %high = trunc <2 x i128> %highx to <2 x i64>
+ ret <2 x i64> %high
+}
+
+; Test a v2i64 signed high-part multiplication.
+define <2 x i64> @f2(<2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmhg %v24, %v24, %v26
+; CHECK-NEXT: br %r14
+ %sext1 = sext <2 x i64> %val1 to <2 x i128>
+ %sext2 = sext <2 x i64> %val2 to <2 x i128>
+ %mulx = mul <2 x i128> %sext1, %sext2
+ %highx = lshr <2 x i128> %mulx, splat(i128 64)
+ %high = trunc <2 x i128> %highx to <2 x i64>
+ ret <2 x i64> %high
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-13.ll b/llvm/test/CodeGen/SystemZ/vec-mul-13.ll
new file mode 100644
index 0000000000000..4d424d9303e8a
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-13.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test high-part vector multiply-and-add.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i8 unsigned high-part multiply-and-add.
+define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmalhb %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %zext1 = zext <16 x i8> %val1 to <16 x i16>
+ %zext2 = zext <16 x i8> %val2 to <16 x i16>
+ %zext3 = zext <16 x i8> %val3 to <16 x i16>
+ %mulx = mul <16 x i16> %zext1, %zext2
+ %addx = add <16 x i16> %mulx, %zext3
+ %highx = lshr <16 x i16> %addx, splat(i16 8)
+ %high = trunc <16 x i16> %highx to <16 x i8>
+ ret <16 x i8> %high
+}
+
+; Test a v16i8 signed high-part multiply-and-add.
+define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmahb %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %sext1 = sext <16 x i8> %val1 to <16 x i16>
+ %sext2 = sext <16 x i8> %val2 to <16 x i16>
+ %sext3 = sext <16 x i8> %val3 to <16 x i16>
+ %mulx = mul <16 x i16> %sext1, %sext2
+ %addx = add <16 x i16> %mulx, %sext3
+ %highx = lshr <16 x i16> %addx, splat(i16 8)
+ %high = trunc <16 x i16> %highx to <16 x i8>
+ ret <16 x i8> %high
+}
+
+; Test a v8i16 unsigned high-part multiply-and-add.
+define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmalhh %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %zext1 = zext <8 x i16> %val1 to <8 x i32>
+ %zext2 = zext <8 x i16> %val2 to <8 x i32>
+ %zext3 = zext <8 x i16> %val3 to <8 x i32>
+ %mulx = mul <8 x i32> %zext1, %zext2
+ %addx = add <8 x i32> %mulx, %zext3
+ %highx = lshr <8 x i32> %addx, splat(i32 16)
+ %high = trunc <8 x i32> %highx to <8 x i16>
+ ret <8 x i16> %high
+}
+
+; Test a v8i16 signed high-part multiply-and-add.
+define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmahh %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %sext1 = sext <8 x i16> %val1 to <8 x i32>
+ %sext2 = sext <8 x i16> %val2 to <8 x i32>
+ %sext3 = sext <8 x i16> %val3 to <8 x i32>
+ %mulx = mul <8 x i32> %sext1, %sext2
+ %addx = add <8 x i32> %mulx, %sext3
+ %highx = lshr <8 x i32> %addx, splat(i32 16)
+ %high = trunc <8 x i32> %highx to <8 x i16>
+ ret <8 x i16> %high
+}
+
+; Test a v4i32 unsigned high-part multiply-and-add.
+define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmalhf %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %zext1 = zext <4 x i32> %val1 to <4 x i64>
+ %zext2 = zext <4 x i32> %val2 to <4 x i64>
+ %zext3 = zext <4 x i32> %val3 to <4 x i64>
+ %mulx = mul <4 x i64> %zext1, %zext2
+ %addx = add <4 x i64> %mulx, %zext3
+ %highx = lshr <4 x i64> %addx, splat(i64 32)
+ %high = trunc <4 x i64> %highx to <4 x i32>
+ ret <4 x i32> %high
+}
+
+; Test a v4i32 signed high-part multiply-and-add.
+define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmahf %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %sext1 = sext <4 x i32> %val1 to <4 x i64>
+ %sext2 = sext <4 x i32> %val2 to <4 x i64>
+ %sext3 = sext <4 x i32> %val3 to <4 x i64>
+ %mulx = mul <4 x i64> %sext1, %sext2
+ %addx = add <4 x i64> %mulx, %sext3
+ %highx = lshr <4 x i64> %addx, splat(i64 32)
+ %high = trunc <4 x i64> %highx to <4 x i32>
+ ret <4 x i32> %high
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-14.ll b/llvm/test/CodeGen/SystemZ/vec-mul-14.ll
new file mode 100644
index 0000000000000..4c3b05486d6e1
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-14.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test high-part vector multiply-and-add on arch15
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=arch15 | FileCheck %s
+
+; Test a v2i64 unsigned high-part multiply-and-add.
+define <2 x i64> @f1(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmalhg %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %zext1 = zext <2 x i64> %val1 to <2 x i128>
+ %zext2 = zext <2 x i64> %val2 to <2 x i128>
+ %zext3 = zext <2 x i64> %val3 to <2 x i128>
+ %mulx = mul <2 x i128> %zext1, %zext2
+ %addx = add <2 x i128> %mulx, %zext3
+ %highx = lshr <2 x i128> %addx, splat(i128 64)
+ %high = trunc <2 x i128> %highx to <2 x i64>
+ ret <2 x i64> %high
+}
+
+; Test a v2i64 signed high-part multiply-and-add.
+define <2 x i64> @f2(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmahg %v24, %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+ %sext1 = sext <2 x i64> %val1 to <2 x i128>
+ %sext2 = sext <2 x i64> %val2 to <2 x i128>
+ %sext3 = sext <2 x i64> %val3 to <2 x i128>
+ %mulx = mul <2 x i128> %sext1, %sext2
+ %addx = add <2 x i128> %mulx, %sext3
+ %highx = lshr <2 x i128> %addx, splat(i128 64)
+ %high = trunc <2 x i128> %highx to <2 x i64>
+ ret <2 x i64> %high
+}
More information about the llvm-commits
mailing list