[llvm] 350247a - [AArch64] Rearrange mul(dup(sext/zext)) to mul(sext/zext(dup))

Nicholas Guy via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 6 08:10:35 PST 2021


Author: Nicholas Guy
Date: 2021-01-06T16:02:16Z
New Revision: 350247a93c07906300b79955ff882004a92ae368

URL: https://github.com/llvm/llvm-project/commit/350247a93c07906300b79955ff882004a92ae368
DIFF: https://github.com/llvm/llvm-project/commit/350247a93c07906300b79955ff882004a92ae368.diff

LOG: [AArch64] Rearrange mul(dup(sext/zext)) to mul(sext/zext(dup))

Performing this rearrangement allows for existing patterns
to match cases where the vector may be built after an extend,
instead of before.

Differential Revision: https://reviews.llvm.org/D91255

Added: 
    llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
    llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 41dc285a368d..40435c12ca3b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11705,9 +11705,152 @@ static bool IsSVECntIntrinsic(SDValue S) {
   return false;
 }
 
+/// Calculates what the pre-extend type is, based on the extension
+/// operation node provided by \p Extend.
+///
+/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
+/// pre-extend type is pulled directly from the operand, while other extend
+/// operations need a bit more inspection to get this information.
+///
+/// \param Extend The SDNode from the DAG that represents the extend operation
+/// \param DAG The SelectionDAG hosting the \p Extend node
+///
+/// \returns The type representing the \p Extend source type, or \p MVT::Other
+/// if no valid type can be determined
+static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) {
+  switch (Extend.getOpcode()) {
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    // The pre-extend type is simply the type of the extend's input.
+    return Extend.getOperand(0).getValueType();
+  case ISD::AssertSext:
+  case ISD::AssertZext:
+  case ISD::SIGN_EXTEND_INREG: {
+    // These carry the original (pre-extension) type as a VT operand.
+    VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
+    if (!TypeNode)
+      return MVT::Other;
+    return TypeNode->getVT();
+  }
+  case ISD::AND: {
+    // An AND with a low-bits mask acts as a zero-extension from the
+    // masked width; recognise the byte/half/word masks.
+    ConstantSDNode *Constant =
+        dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
+    if (!Constant)
+      return MVT::Other;
+
+    // Keep the full 64-bit value: truncating to 32 bits would make an
+    // all-ones 64-bit mask (~0ULL) collapse to UINT_MAX and be wrongly
+    // reported as a zero-extension from i32.
+    uint64_t Mask = Constant->getZExtValue();
+
+    if (Mask == UCHAR_MAX)
+      return MVT::i8;
+    else if (Mask == USHRT_MAX)
+      return MVT::i16;
+    else if (Mask == UINT_MAX)
+      return MVT::i32;
+
+    return MVT::Other;
+  }
+  default:
+    return MVT::Other;
+  }
+
+  llvm_unreachable("Code path unhandled in calculatePreExtendType!");
+}
+
+/// Combines a dup(sext/zext) node pattern into sext/zext(dup)
+/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
+static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle,
+                                                SelectionDAG &DAG) {
+
+  ShuffleVectorSDNode *ShuffleNode =
+      dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode());
+  if (!ShuffleNode)
+    return SDValue();
+
+  // Only handle splats of lane 0, i.e. a true broadcast (dup) of the
+  // first element.
+  if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0)
+    return SDValue();
+
+  SDValue InsertVectorElt = VectorShuffle.getOperand(0);
+
+  if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
+    return SDValue();
+
+  SDValue InsertLane = InsertVectorElt.getOperand(2);
+  ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode());
+  // Ensures the insert is inserting into lane 0
+  if (!Constant || Constant->getZExtValue() != 0)
+    return SDValue();
+
+  SDValue Extend = InsertVectorElt.getOperand(1);
+  unsigned ExtendOpcode = Extend.getOpcode();
+
+  bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
+                ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
+                ExtendOpcode == ISD::AssertSext;
+  if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
+      ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
+    return SDValue();
+
+  EVT TargetType = VectorShuffle.getValueType();
+  EVT PreExtendType = calculatePreExtendType(Extend, DAG);
+
+  // Restrict to the fixed-width vector types with a matching AArch64
+  // widening-multiply (smull/umull) pattern.
+  if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 &&
+       TargetType != MVT::v2i64) ||
+      (PreExtendType == MVT::Other))
+    return SDValue();
+
+  EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType);
+
+  if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount())
+    return SDValue();
+
+  // Only a single doubling extend step can be folded into the multiply.
+  if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2)
+    return SDValue();
+
+  SDLoc DL(VectorShuffle);
+
+  // Rebuild insert+shuffle at the narrow type. INSERT_VECTOR_ELT permits
+  // a scalar wider than the element type (it is implicitly truncated),
+  // so Extend's input can be inserted directly.
+  SDValue InsertVectorNode = DAG.getNode(
+      InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT),
+      Extend.getOperand(0), DAG.getConstant(0, DL, MVT::i64));
+
+  // Value-initialized to all zeroes: a splat-of-lane-0 mask.
+  std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue());
+
+  SDValue VectorShuffleNode =
+      DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode,
+                           DAG.getUNDEF(PreExtendVT), ShuffleMask);
+
+  // ISD::SIGN_EXTEND and ISD::ZERO_EXTEND are unary operations, so do
+  // not pass a spurious ValueType operand (only SIGN_EXTEND_INREG and
+  // the AssertExt nodes carry one).
+  SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND
+                                          : ISD::ZERO_EXTEND,
+                                   DL, TargetType, VectorShuffleNode);
+
+  return ExtendNode;
+}
+
+/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
+/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
+static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
+  // Scalar multiplies cannot have dup operands, so only vectors qualify
+  EVT VT = Mul->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue LHS = Mul->getOperand(0);
+  SDValue RHS = Mul->getOperand(1);
+  SDValue NewLHS = performCommonVectorExtendCombine(LHS, DAG);
+  SDValue NewRHS = performCommonVectorExtendCombine(RHS, DAG);
+
+  // Bail out when neither side was rewritten
+  if (!NewLHS && !NewRHS)
+    return SDValue();
+
+  // Rebuild the multiply, substituting whichever operands were combined
+  SDLoc DL(Mul);
+  return DAG.getNode(Mul->getOpcode(), DL, VT, NewLHS ? NewLHS : LHS,
+                     NewRHS ? NewRHS : RHS);
+}
+
 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
+
+  if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
+    return Ext;
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
new file mode 100644
index 000000000000..082e2db52705
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
@@ -0,0 +1,327 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define <vscale x 2 x i16> @dupsext_v2i8_v2i16(i8 %src, <vscale x 2 x i16> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i8 %src to i16
+    %broadcast.splatinsert = insertelement <vscale x 2 x i16> undef, i16 %in, i16 0
+    %broadcast.splat = shufflevector <vscale x 2 x i16> %broadcast.splatinsert, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nsw <vscale x 2 x i16> %broadcast.splat, %b
+    ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 4 x i16> @dupsext_v4i8_v4i16(i8 %src, <vscale x 4 x i16> %b) {
+; CHECK-LABEL: dupsext_v4i8_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i8 %src to i16
+    %broadcast.splatinsert = insertelement <vscale x 4 x i16> undef, i16 %in, i16 0
+    %broadcast.splat = shufflevector <vscale x 4 x i16> %broadcast.splatinsert, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
+    %out = mul nsw <vscale x 4 x i16> %broadcast.splat, %b
+    ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 8 x i16> @dupsext_v8i8_v8i16(i8 %src, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: dupsext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i8 %src to i16
+    %broadcast.splatinsert = insertelement <vscale x 8 x i16> undef, i16 %in, i16 0
+    %broadcast.splat = shufflevector <vscale x 8 x i16> %broadcast.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+    %out = mul nsw <vscale x 8 x i16> %broadcast.splat, %b
+    ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 2 x i32> @dupsext_v2i8_v2i32(i8 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i8 %src to i32
+    %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nsw <vscale x 2 x i32> %broadcast.splat, %b
+    ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupsext_v4i8_v4i32(i8 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupsext_v4i8_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i8 %src to i32
+    %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+    %out = mul nsw <vscale x 4 x i32> %broadcast.splat, %b
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupsext_v2i8_v2i64(i8 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtb x8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i8 %src to i64
+    %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+    %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nsw <vscale x 2 x i64> %broadcast.splat, %b
+    ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i32> @dupsext_v2i16_v2i32(i16 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupsext_v2i16_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxth w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i16 %src to i32
+    %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nsw <vscale x 2 x i32> %broadcast.splat, %b
+    ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupsext_v4i16_v4i32(i16 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupsext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxth w8, w0
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i16 %src to i32
+    %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+    %out = mul nsw <vscale x 4 x i32> %broadcast.splat, %b
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupsext_v2i16_v2i64(i16 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupsext_v2i16_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxth x8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i16 %src to i64
+    %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+    %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nsw <vscale x 2 x i64> %broadcast.splat, %b
+    ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @dupsext_v2i32_v2i64(i32 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupsext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i32 %src to i64
+    %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+    %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nsw <vscale x 2 x i64> %broadcast.splat, %b
+    ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i16> @dupzext_v2i8_v2i16(i8 %src, <vscale x 2 x i16> %b) {
+; CHECK-LABEL: dupzext_v2i8_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i8 %src to i16
+    %broadcast.splatinsert = insertelement <vscale x 2 x i16> undef, i16 %in, i16 0
+    %broadcast.splat = shufflevector <vscale x 2 x i16> %broadcast.splatinsert, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nuw <vscale x 2 x i16> %broadcast.splat, %b
+    ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 4 x i16> @dupzext_v4i8_v4i16(i8 %src, <vscale x 4 x i16> %b) {
+; CHECK-LABEL: dupzext_v4i8_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i8 %src to i16
+    %broadcast.splatinsert = insertelement <vscale x 4 x i16> undef, i16 %in, i16 0
+    %broadcast.splat = shufflevector <vscale x 4 x i16> %broadcast.splatinsert, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
+    %out = mul nuw <vscale x 4 x i16> %broadcast.splat, %b
+    ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 8 x i16> @dupzext_v8i8_v8i16(i8 %src, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: dupzext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i8 %src to i16
+    %broadcast.splatinsert = insertelement <vscale x 8 x i16> undef, i16 %in, i16 0
+    %broadcast.splat = shufflevector <vscale x 8 x i16> %broadcast.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+    %out = mul nuw <vscale x 8 x i16> %broadcast.splat, %b
+    ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 2 x i32> @dupzext_v2i8_v2i32(i8 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupzext_v2i8_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i8 %src to i32
+    %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nuw <vscale x 2 x i32> %broadcast.splat, %b
+    ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupzext_v4i8_v4i32(i8 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupzext_v4i8_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i8 %src to i32
+    %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+    %out = mul nuw <vscale x 4 x i32> %broadcast.splat, %b
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupzext_v2i8_v2i64(i8 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupzext_v2i8_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    and x8, x0, #0xff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i8 %src to i64
+    %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+    %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nuw <vscale x 2 x i64> %broadcast.splat, %b
+    ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i32> @dupzext_v2i16_v2i32(i16 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupzext_v2i16_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i16 %src to i32
+    %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nuw <vscale x 2 x i32> %broadcast.splat, %b
+    ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupzext_v4i16_v4i32(i16 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupzext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i16 %src to i32
+    %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+    %out = mul nuw <vscale x 4 x i32> %broadcast.splat, %b
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupzext_v2i16_v2i64(i16 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupzext_v2i16_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    and x8, x0, #0xffff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i16 %src to i64
+    %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+    %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nuw <vscale x 2 x i64> %broadcast.splat, %b
+    ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @dupzext_v2i32_v2i64(i32 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupzext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i32 %src to i64
+    %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+    %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+    %out = mul nuw <vscale x 2 x i64> %broadcast.splat, %b
+    ret <vscale x 2 x i64> %out
+}

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
new file mode 100644
index 000000000000..07ac3c87d143
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s
+
+; Supported combines
+
+define <8 x i16> @dupsext_v8i8_v8i16(i8 %src, <8 x i8> %b) {
+; CHECK-LABEL: dupsext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.8b, w0
+; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i8 %src to i16
+    %ext.b = sext <8 x i8> %b to <8 x i16>
+    %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 0
+    %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+    %out = mul nsw <8 x i16> %broadcast.splat, %ext.b
+    ret <8 x i16> %out
+}
+
+define <8 x i16> @dupzext_v8i8_v8i16(i8 %src, <8 x i8> %b) {
+; CHECK-LABEL: dupzext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.8b, w0
+; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i8 %src to i16
+    %ext.b = zext <8 x i8> %b to <8 x i16>
+    %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 0
+    %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+    %out = mul nuw <8 x i16> %broadcast.splat, %ext.b
+    ret <8 x i16> %out
+}
+
+define <4 x i32> @dupsext_v4i16_v4i32(i16 %src, <4 x i16> %b) {
+; CHECK-LABEL: dupsext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.4h, w0
+; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i16 %src to i32
+    %ext.b = sext <4 x i16> %b to <4 x i32>
+    %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+    %out = mul nsw <4 x i32> %broadcast.splat, %ext.b
+    ret <4 x i32> %out
+}
+
+define <4 x i32> @dupzext_v4i16_v4i32(i16 %src, <4 x i16> %b) {
+; CHECK-LABEL: dupzext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.4h, w0
+; CHECK-NEXT:    umull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i16 %src to i32
+    %ext.b = zext <4 x i16> %b to <4 x i32>
+    %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0
+    %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+    %out = mul nuw <4 x i32> %broadcast.splat, %ext.b
+    ret <4 x i32> %out
+}
+
+define <2 x i64> @dupsext_v2i32_v2i64(i32 %src, <2 x i32> %b) {
+; CHECK-LABEL: dupsext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    smull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i32 %src to i64
+    %ext.b = sext <2 x i32> %b to <2 x i64>
+    %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0
+    %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+    %out = mul nsw <2 x i64> %broadcast.splat, %ext.b
+    ret <2 x i64> %out
+}
+
+define <2 x i64> @dupzext_v2i32_v2i64(i32 %src, <2 x i32> %b) {
+; CHECK-LABEL: dupzext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    umull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i32 %src to i64
+    %ext.b = zext <2 x i32> %b to <2 x i64>
+    %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0
+    %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+    %out = mul nuw <2 x i64> %broadcast.splat, %ext.b
+    ret <2 x i64> %out
+}
+
+; Unsupported combines
+
+define <2 x i16> @dupsext_v2i8_v2i16(i8 %src, <2 x i8> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-NEXT:    dup v1.2s, w8
+; CHECK-NEXT:    mul v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i8 %src to i16
+    %ext.b = sext <2 x i8> %b to <2 x i16>
+    %broadcast.splatinsert = insertelement <2 x i16> undef, i16 %in, i16 0
+    %broadcast.splat = shufflevector <2 x i16> %broadcast.splatinsert, <2 x i16> undef, <2 x i32> zeroinitializer
+    %out = mul nsw <2 x i16> %broadcast.splat, %ext.b
+    ret <2 x i16> %out
+}
+
+define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) {
+; CHECK-LABEL: dupzext_v2i16_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    and x8, x0, #0xffff
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    mul x10, x8, x10
+; CHECK-NEXT:    mul x8, x8, x9
+; CHECK-NEXT:    fmov d0, x10
+; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    ret
+entry:
+    %in = zext i16 %src to i64
+    %ext.b = zext <2 x i16> %b to <2 x i64>
+    %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0
+    %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+    %out = mul nuw <2 x i64> %broadcast.splat, %ext.b
+    ret <2 x i64> %out
+}
+
+; dupsext_v4i8_v4i16
+; dupsext_v2i8_v2i32
+; dupsext_v4i8_v4i32
+; dupsext_v2i8_v2i64
+; dupsext_v2i16_v2i32
+; dupsext_v2i16_v2i64
+; dupzext_v2i8_v2i16
+; dupzext_v4i8_v4i16
+; dupzext_v2i8_v2i32
+; dupzext_v4i8_v4i32
+; dupzext_v2i8_v2i64
+; dupzext_v2i16_v2i32
+; dupzext_v2i16_v2i64
+
+; Unsupported states
+
+define <8 x i16> @nonsplat_shuffleinsert(i8 %src, <8 x i8> %b) {
+; CHECK-LABEL: nonsplat_shuffleinsert:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    ret
+entry:
+    %in = sext i8 %src to i16
+    %ext.b = sext <8 x i8> %b to <8 x i16>
+    %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 1
+    %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
+    %out = mul nsw <8 x i16> %broadcast.splat, %ext.b
+    ret <8 x i16> %out
+}
+
+define <8 x i16> @missing_insert(<8 x i8> %b) {
+; CHECK-LABEL: missing_insert:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    ret
+entry:
+    %ext.b = sext <8 x i8> %b to <8 x i16>
+    %broadcast.splat = shufflevector <8 x i16> %ext.b, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
+    %out = mul nsw <8 x i16> %broadcast.splat, %ext.b
+    ret <8 x i16> %out
+}


        


More information about the llvm-commits mailing list