[llvm] r353486 - [AArch64] Fix condition for "high-vector" DUP optimizations.

Thu Feb 7 16:23:35 PST 2019

Author: efriedma
Date: Thu Feb  7 16:23:35 2019
New Revision: 353486

URL: http://llvm.org/viewvc/llvm-project?rev=353486&view=rev
Log:
[AArch64] Fix condition for "high-vector" DUP optimizations.

AArch64 NEON has a bunch of instructions with a "2" suffix that extract
the top half of the source vectors, instead of the bottom half.  We have
some DAGCombines to try to take advantage of that.  However, they
assumed that any EXTRACT_VECTOR was extracting the high half of the
vector in question.

This issue has apparently existed since the AArch64 backend was merged.

Fixes https://bugs.llvm.org/show_bug.cgi?id=40632 .

Differential Revision: https://reviews.llvm.org/D57862


Modified:
    llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll
    llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll
    llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll

Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=353486&r1=353485&r2=353486&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Thu Feb  7 16:23:35 2019
@@ -9722,12 +9722,13 @@ static SDValue tryExtendDUPToExtractHigh
                      DAG.getConstant(NumElems, dl, MVT::i64));
 }
 
-static bool isEssentiallyExtractSubvector(SDValue N) {
-  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
-    return true;
-
-  return N.getOpcode() == ISD::BITCAST &&
-         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+static bool isEssentiallyExtractHighSubvector(SDValue N) {
+  if (N.getOpcode() == ISD::BITCAST)
+    N = N.getOperand(0);
+  if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return false;
+  return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
+         N.getOperand(0).getValueType().getVectorNumElements() / 2;
 }
 
 /// Helper structure to keep track of ISD::SET_CC operands.
@@ -9894,13 +9895,13 @@ static SDValue performAddSubLongCombine(
 
   // It's not worth doing if at least one of the inputs isn't already an
   // extract, but we don't know which it'll be so we have to try both.
-  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+  if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
     RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
     if (!RHS.getNode())
       return SDValue();
 
     RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
-  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+  } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
     LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
     if (!LHS.getNode())
       return SDValue();
@@ -9933,11 +9934,11 @@ static SDValue tryCombineLongOpWithDup(u
   // Either node could be a DUP, but it's not worth doing both of them (you'd
   // just as well use the non-high version) so look for a corresponding extract
   // operation on the other "wing".
-  if (isEssentiallyExtractSubvector(LHS)) {
+  if (isEssentiallyExtractHighSubvector(LHS)) {
     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
     if (!RHS.getNode())
       return SDValue();
-  } else if (isEssentiallyExtractSubvector(RHS)) {
+  } else if (isEssentiallyExtractHighSubvector(RHS)) {
     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
     if (!LHS.getNode())
       return SDValue();

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll?rev=353486&r1=353485&r2=353486&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll Thu Feb  7 16:23:35 2019
@@ -885,6 +885,20 @@ declare double @llvm.fabs.f64(double) no
 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: uabdl_from_extract_dup:
 ; CHECK-NOT: ext.16b
+; CHECK: uabdl.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  %res1 = zext <2 x i32> %res to <2 x i64>
+  ret <2 x i64> %res1
+}
+
+define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uabdl2_from_extract_dup:
+; CHECK-NOT: ext.16b
 ; CHECK: uabdl2.2d
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -899,6 +913,20 @@ define <2 x i64> @uabdl_from_extract_dup
 define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: sabdl_from_extract_dup:
 ; CHECK-NOT: ext.16b
+; CHECK: sabdl.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  %res1 = zext <2 x i32> %res to <2 x i64>
+  ret <2 x i64> %res1
+}
+
+define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: sabdl2_from_extract_dup:
+; CHECK-NOT: ext.16b
 ; CHECK: sabdl2.2d
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll?rev=353486&r1=353485&r2=353486&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll Thu Feb  7 16:23:35 2019
@@ -738,6 +738,22 @@ declare <2 x float> @llvm.aarch64.neon.a
 declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
 declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
+define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uaddl_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: uaddl.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+  %res = add <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
 define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: uaddl2_duprhs
 ; CHECK-NOT: ext.16b
@@ -754,6 +770,22 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32
   ret <2 x i64> %res
 }
 
+define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: saddl_duplhs
+; CHECK-NOT: ext.16b
+; CHECK: saddl.2d
+  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+  %res = add <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
 define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
 ; CHECK-LABEL: saddl2_duplhs
 ; CHECK-NOT: ext.16b
@@ -770,6 +802,22 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs
   ret <2 x i64> %res
 }
 
+define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: usubl_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: usubl.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+  %res = sub <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
 define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: usubl2_duprhs
 ; CHECK-NOT: ext.16b
@@ -786,8 +834,24 @@ define <2 x i64> @usubl2_duprhs(<4 x i32
   ret <2 x i64> %res
 }
 
+define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: ssubl_duplhs:
+; CHECK-NOT: ext.16b
+; CHECK: ssubl.2d
+  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+  %res = sub <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
 define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: ssubl2_duplhs
+; CHECK-LABEL: ssubl2_duplhs:
 ; CHECK-NOT: ext.16b
 ; CHECK: ssubl2.2d
   %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll?rev=353486&r1=353485&r2=353486&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll Thu Feb  7 16:23:35 2019
@@ -1338,6 +1338,19 @@ entry:
   ret <4 x i32> %vmull2.i
 }
 
+define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo6a:
+; CHECK-NEXT: smull.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+  %0 = bitcast <8 x i16> %b to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+  ret <4 x i32> %vmull2.i
+}
+
 define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo7:
 ; CHECK-NEXT: smull2.2d v0, v1, v2[1]
@@ -1351,6 +1364,20 @@ entry:
   ret <2 x i64> %vmull2.i
 }
 
+define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo7a:
+; CHECK-NEXT: smull.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+  %0 = bitcast <4 x i32> %b to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+  ret <2 x i64> %vmull2.i
+}
+
+
 define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo8:
 ; CHECK-NEXT: umull2.4s v0, v1, v2[1]
@@ -1364,6 +1391,19 @@ entry:
   ret <4 x i32> %vmull2.i
 }
 
+define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo8a:
+; CHECK-NEXT: umull.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+  %0 = bitcast <8 x i16> %b to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+  ret <4 x i32> %vmull2.i
+}
+
 define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo9:
 ; CHECK-NEXT: umull2.2d v0, v1, v2[1]
@@ -1377,6 +1417,19 @@ entry:
   ret <2 x i64> %vmull2.i
 }
 
+define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo9a:
+; CHECK-NEXT: umull.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+  %0 = bitcast <4 x i32> %b to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+  ret <2 x i64> %vmull2.i
+}
+
 define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
 ; CHECK-LABEL: bar0:
 ; CHECK: smlal2.8h v0, v1, v2
@@ -1667,6 +1720,24 @@ entry:
   ret <2 x i64> %vmull2.i
 }
 
+define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK: vmull_low_n_s16_test
+; CHECK-NOT: ext
+; CHECK: smull.4s
+; CHECK-NEXT: ret
+  %conv = trunc i32 %d to i16
+  %0 = bitcast <8 x i16> %b to <2 x i64>
+  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+  ret <4 x i32> %vmull2.i.i
+}
+
 define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
 entry:
 ; CHECK: vmull_high_n_s16_test
@@ -1804,8 +1875,21 @@ define <2 x i64> @mlal_from_two_extracts
   ret <2 x i64> %sum
 }
 
-define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: mull_from_extract_dup:
+define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup_low:
+; CHECK-NOT: ext
+; CHECK: sqdmull.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup_high:
 ; CHECK-NOT: ext
 ; CHECK: sqdmull2.2d
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
@@ -1817,8 +1901,21 @@ define <2 x i64> @mull_from_extract_dup(
   ret <2 x i64> %res
 }
 
-define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
-; CHECK-LABEL: pmull_from_extract_dup:
+define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup_low:
+; CHECK-NOT: ext
+; CHECK: pmull.8h
+  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
+  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup_high:
 ; CHECK-NOT: ext
 ; CHECK: pmull2.8h
   %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
@@ -1830,8 +1927,20 @@ define <8 x i16> @pmull_from_extract_dup
   ret <8 x i16> %res
 }
 
-define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK-LABEL: pmull_from_extract_duplane:
+define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: pmull.8h
+
+  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane_high:
 ; CHECK-NOT: ext
 ; CHECK: pmull2.8h
 
@@ -1842,8 +1951,20 @@ define <8 x i16> @pmull_from_extract_dup
   ret <8 x i16> %res
 }
 
-define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: sqdmull_from_extract_duplane:
+define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: sqdmull.2d
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane_high:
 ; CHECK-NOT: ext
 ; CHECK: sqdmull2.2d
 
@@ -1854,8 +1975,21 @@ define <2 x i64> @sqdmull_from_extract_d
   ret <2 x i64> %res
 }
 
-define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: sqdmlal_from_extract_duplane:
+define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: sqdmlal.2d
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+  ret <2 x i64> %sum
+}
+
+define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
 ; CHECK-NOT: ext
 ; CHECK: sqdmlal2.2d
 
@@ -1867,8 +2001,21 @@ define <2 x i64> @sqdmlal_from_extract_d
   ret <2 x i64> %sum
 }
 
-define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: umlal_from_extract_duplane:
+define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: umlal.2d
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  %sum = add <2 x i64> %accum, %res
+  ret <2 x i64> %sum
+}
+
+define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane_high:
 ; CHECK-NOT: ext
 ; CHECK: umlal2.2d