[llvm] r353486 - [AArch64] Fix condition for "high-vector" DUP optimizations.
Eli Friedman via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 7 16:23:35 PST 2019
Author: efriedma
Date: Thu Feb 7 16:23:35 2019
New Revision: 353486
URL: http://llvm.org/viewvc/llvm-project?rev=353486&view=rev
Log:
[AArch64] Fix condition for "high-vector" DUP optimizations.
AArch64 NEON has a bunch of instructions with a "2" suffix that extract
the top half of the source vectors, instead of the bottom half. We have
some DAGCombines to try to take advantage of that. However, they
assumed that any EXTRACT_SUBVECTOR was extracting the high half of the
vector in question.
This issue has apparently existed since the AArch64 backend was merged.
Fixes https://bugs.llvm.org/show_bug.cgi?id=40632 .
Differential Revision: https://reviews.llvm.org/D57862
Modified:
llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll
llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll
llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll
Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=353486&r1=353485&r2=353486&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Thu Feb 7 16:23:35 2019
@@ -9722,12 +9722,13 @@ static SDValue tryExtendDUPToExtractHigh
DAG.getConstant(NumElems, dl, MVT::i64));
}
-static bool isEssentiallyExtractSubvector(SDValue N) {
- if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
- return true;
-
- return N.getOpcode() == ISD::BITCAST &&
- N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+static bool isEssentiallyExtractHighSubvector(SDValue N) {
+ if (N.getOpcode() == ISD::BITCAST)
+ N = N.getOperand(0);
+ if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+ return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
+ N.getOperand(0).getValueType().getVectorNumElements() / 2;
}
/// Helper structure to keep track of ISD::SET_CC operands.
@@ -9894,13 +9895,13 @@ static SDValue performAddSubLongCombine(
// It's not worth doing if at least one of the inputs isn't already an
// extract, but we don't know which it'll be so we have to try both.
- if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+ if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
if (!RHS.getNode())
return SDValue();
RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
- } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+ } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
if (!LHS.getNode())
return SDValue();
@@ -9933,11 +9934,11 @@ static SDValue tryCombineLongOpWithDup(u
// Either node could be a DUP, but it's not worth doing both of them (you'd
// just as well use the non-high version) so look for a corresponding extract
// operation on the other "wing".
- if (isEssentiallyExtractSubvector(LHS)) {
+ if (isEssentiallyExtractHighSubvector(LHS)) {
RHS = tryExtendDUPToExtractHigh(RHS, DAG);
if (!RHS.getNode())
return SDValue();
- } else if (isEssentiallyExtractSubvector(RHS)) {
+ } else if (isEssentiallyExtractHighSubvector(RHS)) {
LHS = tryExtendDUPToExtractHigh(LHS, DAG);
if (!LHS.getNode())
return SDValue();
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll?rev=353486&r1=353485&r2=353486&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll Thu Feb 7 16:23:35 2019
@@ -885,6 +885,20 @@ declare double @llvm.fabs.f64(double) no
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
+; CHECK: uabdl.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res1 = zext <2 x i32> %res to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uabdl2_from_extract_dup:
+; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -899,6 +913,20 @@ define <2 x i64> @uabdl_from_extract_dup
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
+; CHECK: sabdl.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res1 = zext <2 x i32> %res to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: sabdl2_from_extract_dup:
+; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll?rev=353486&r1=353485&r2=353486&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll Thu Feb 7 16:23:35 2019
@@ -738,6 +738,22 @@ declare <2 x float> @llvm.aarch64.neon.a
declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uaddl_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: uaddl.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+ %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+ %res = add <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl2_duprhs
; CHECK-NOT: ext.16b
@@ -754,6 +770,22 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32
ret <2 x i64> %res
}
+define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: saddl_duplhs
+; CHECK-NOT: ext.16b
+; CHECK: saddl.2d
+ %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+ %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+ %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+ %res = add <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl2_duplhs
; CHECK-NOT: ext.16b
@@ -770,6 +802,22 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs
ret <2 x i64> %res
}
+define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: usubl_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: usubl.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+ %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+ %res = sub <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl2_duprhs
; CHECK-NOT: ext.16b
@@ -786,8 +834,24 @@ define <2 x i64> @usubl2_duprhs(<4 x i32
ret <2 x i64> %res
}
+define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: ssubl_duplhs:
+; CHECK-NOT: ext.16b
+; CHECK: ssubl.2d
+ %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+ %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+ %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+ %res = sub <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: ssubl2_duplhs
+; CHECK-LABEL: ssubl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl2.2d
%lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll?rev=353486&r1=353485&r2=353486&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll Thu Feb 7 16:23:35 2019
@@ -1338,6 +1338,19 @@ entry:
ret <4 x i32> %vmull2.i
}
+define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo6a:
+; CHECK-NEXT: smull.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo7:
; CHECK-NEXT: smull2.2d v0, v1, v2[1]
@@ -1351,6 +1364,20 @@ entry:
ret <2 x i64> %vmull2.i
}
+define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo7a:
+; CHECK-NEXT: smull.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
+
define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo8:
; CHECK-NEXT: umull2.4s v0, v1, v2[1]
@@ -1364,6 +1391,19 @@ entry:
ret <4 x i32> %vmull2.i
}
+define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo8a:
+; CHECK-NEXT: umull.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo9:
; CHECK-NEXT: umull2.2d v0, v1, v2[1]
@@ -1377,6 +1417,19 @@ entry:
ret <2 x i64> %vmull2.i
}
+define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo9a:
+; CHECK-NEXT: umull.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
; CHECK-LABEL: bar0:
; CHECK: smlal2.8h v0, v1, v2
@@ -1667,6 +1720,24 @@ entry:
ret <2 x i64> %vmull2.i
}
+define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK: vmull_low_n_s16_test
+; CHECK-NOT: ext
+; CHECK: smull.4s
+; CHECK-NEXT: ret
+ %conv = trunc i32 %d to i16
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; CHECK: vmull_high_n_s16_test
@@ -1804,8 +1875,21 @@ define <2 x i64> @mlal_from_two_extracts
ret <2 x i64> %sum
}
-define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: mull_from_extract_dup:
+define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup_low:
+; CHECK-NOT: ext
+; CHECK: sqdmull.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup_high:
; CHECK-NOT: ext
; CHECK: sqdmull2.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
@@ -1817,8 +1901,21 @@ define <2 x i64> @mull_from_extract_dup(
ret <2 x i64> %res
}
-define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
-; CHECK-LABEL: pmull_from_extract_dup:
+define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup_low:
+; CHECK-NOT: ext
+; CHECK: pmull.8h
+ %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
+ %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup_high:
; CHECK-NOT: ext
; CHECK: pmull2.8h
%rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
@@ -1830,8 +1927,20 @@ define <8 x i16> @pmull_from_extract_dup
ret <8 x i16> %res
}
-define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK-LABEL: pmull_from_extract_duplane:
+define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: pmull.8h
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: pmull2.8h
@@ -1842,8 +1951,20 @@ define <8 x i16> @pmull_from_extract_dup
ret <8 x i16> %res
}
-define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: sqdmull_from_extract_duplane:
+define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: sqdmull.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: sqdmull2.2d
@@ -1854,8 +1975,21 @@ define <2 x i64> @sqdmull_from_extract_d
ret <2 x i64> %res
}
-define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: sqdmlal_from_extract_duplane:
+define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: sqdmlal.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: sqdmlal2.2d
@@ -1867,8 +2001,21 @@ define <2 x i64> @sqdmlal_from_extract_d
ret <2 x i64> %sum
}
-define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: umlal_from_extract_duplane:
+define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: umlal.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = add <2 x i64> %accum, %res
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: umlal2.2d
More information about the llvm-commits
mailing list