[llvm] [AArch64] Combine zext of deinterleaving shuffle. (PR #107201)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 4 01:56:15 PDT 2024
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/107201
This is part 1 of a few patches that are intended to take deinterleaving shuffles with masks like `[0,4,8,12]`, where the shuffle is zero-extended to a larger size, and optimize away the deinterleave. In this case it converts them to `and(uzp1, mask)`, where the `uzp1` acts upon the elements in the larger type size to get the lanes into the correct positions, and the `and` performs the zext. It performs the combine fairly late, on the legalized type so that uitofp that are converted to uitofp(zext(..)) will also be handled.
>From 9cf7aea8a2fa57cef7cfc9162f453af30ebc1a62 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 4 Sep 2024 09:38:43 +0100
Subject: [PATCH] [AArch64] Combine zext of deinterleaving shuffle.
This is part 1 of a few patches that are intended to take deinterleaving
shuffles with masks like `[0,4,8,12]`, where the shuffle is zero-extended to a
larger size, and optimize away the deinterleave. In this case it converts them
to `and(uzp1, mask)`, where the `uzp1` acts upon the elements in the larger type
size to get the lanes into the correct positions, and the `and` performs the
zext. It performs the combine fairly late, on the legalized type so that uitofp
that are converted to uitofp(zext(..)) will also be handled.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 56 ++++
llvm/test/CodeGen/AArch64/zext-shuffle.ll | 286 +++++++-----------
2 files changed, 163 insertions(+), 179 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1735ff5cd69748..53a0e1c053a8ce 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22182,6 +22182,59 @@ performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
+// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
+// This comes from interleaved vectorization. It is performed late to capture
+// uitofp converts too.
+static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
+ N->getOpcode() != ISD::ZERO_EXTEND ||
+ N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return SDValue();
+
+ unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
+ if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
+ return SDValue();
+
+ EVT InVT = N->getOperand(0).getOperand(0).getValueType();
+ auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
+ if (!Shuffle ||
+ InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
+ InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
+ return SDValue();
+
+ unsigned Idx;
+ bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
+ Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
+ // An undef interleave shuffle can come up after other canonicalizations,
+ // where the shuffle has been converted to
+ // zext(extract(shuffle b, undef, [u,u,0,4]))
+ bool IsUndefDeInterleave = false;
+ if (!IsDeInterleave)
+ IsUndefDeInterleave =
+ Shuffle->getOperand(1).isUndef() &&
+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(
+ Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
+ VT.getVectorNumElements() / 2),
+ 4, Idx);
+ if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
+ return SDValue();
+ SDLoc DL(N);
+ SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+ Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
+ SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+ Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
+ SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
+ VT, BC1, BC2);
+ if ((Idx & 1) == 1)
+ UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
+ DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
+ return DAG.getNode(
+ ISD::AND, DL, VT, UZP,
+ DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
+}
+
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -22202,6 +22255,9 @@ static SDValue performExtendCombine(SDNode *N,
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
}
+ if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
+ return R;
+
if (N->getValueType(0).isFixedLengthVector() &&
N->getOpcode() == ISD::SIGN_EXTEND &&
N->getOperand(0)->getOpcode() == ISD::SETCC)
diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
index 4ef8daf141715b..af5a92017bbbca 100644
--- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
@@ -76,12 +76,9 @@ define <2 x i64> @v2i64_37(<4 x i32> %a, <4 x i32> %b) {
define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) {
; CHECK-LABEL: v2i64_i16_04812:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI6_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ret
@@ -93,12 +90,8 @@ define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) {
define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) {
; CHECK-LABEL: v2i64_i16_15913:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI7_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ushr v0.4s, v0.4s, #16
; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ret
@@ -110,12 +103,9 @@ define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) {
define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) {
; CHECK-LABEL: v2i64_i16_261014:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI8_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ret
@@ -127,12 +117,8 @@ define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) {
define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) {
; CHECK-LABEL: v2i64_i16_371115:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI9_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ushr v0.4s, v0.4s, #16
; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ret
@@ -167,12 +153,9 @@ define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: v4i32_04812:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI12_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%d = zext <4 x i16> %c to <4 x i32>
@@ -182,12 +165,8 @@ define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: v4i32_15913:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI13_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ushr v0.4s, v0.4s, #16
; CHECK-NEXT: ret
%c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
%d = zext <4 x i16> %c to <4 x i32>
@@ -197,12 +176,9 @@ define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: v4i32_261014:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI14_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
%d = zext <4 x i16> %c to <4 x i32>
@@ -212,12 +188,8 @@ define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @v4i32_371115(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: v4i32_371115:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI15_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ushr v0.4s, v0.4s, #16
; CHECK-NEXT: ret
%c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
%d = zext <4 x i16> %c to <4 x i32>
@@ -249,12 +221,8 @@ define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i16_04812:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI18_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%d = zext <8 x i8> %c to <8 x i16>
@@ -264,12 +232,8 @@ define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i16_15913:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI19_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ushr v0.8h, v0.8h, #8
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
%d = zext <8 x i8> %c to <8 x i16>
@@ -279,12 +243,8 @@ define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i16_261014:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI20_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
%d = zext <8 x i8> %c to <8 x i16>
@@ -294,12 +254,8 @@ define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i16_371115:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI21_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ushr v0.8h, v0.8h, #8
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
%d = zext <8 x i8> %c to <8 x i16>
@@ -310,42 +266,23 @@ define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) {
define <8 x i64> @zext_add(<32 x i16> %l) {
; CHECK-LABEL: zext_add:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI22_0
-; CHECK-NEXT: adrp x9, .LCPI22_3
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_0]
-; CHECK-NEXT: adrp x8, .LCPI22_1
-; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI22_3]
-; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI22_1]
-; CHECK-NEXT: adrp x8, .LCPI22_2
-; CHECK-NEXT: adrp x9, .LCPI22_7
-; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI22_2]
-; CHECK-NEXT: adrp x8, .LCPI22_4
-; CHECK-NEXT: ldr q18, [x9, :lo12:.LCPI22_7]
-; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI22_4]
-; CHECK-NEXT: adrp x8, .LCPI22_5
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v5.16b, { v0.16b, v1.16b }, v5.16b
-; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI22_5]
-; CHECK-NEXT: adrp x8, .LCPI22_6
-; CHECK-NEXT: tbl v7.16b, { v0.16b, v1.16b }, v7.16b
-; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI22_6]
-; CHECK-NEXT: tbl v17.16b, { v0.16b, v1.16b }, v17.16b
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b
-; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v4.16b
-; CHECK-NEXT: tbl v4.16b, { v2.16b, v3.16b }, v6.16b
-; CHECK-NEXT: tbl v6.16b, { v2.16b, v3.16b }, v16.16b
-; CHECK-NEXT: tbl v2.16b, { v2.16b, v3.16b }, v19.16b
-; CHECK-NEXT: uaddl v5.4s, v5.4h, v7.4h
-; CHECK-NEXT: uaddl v7.4s, v17.4h, v0.4h
-; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v4.8h
-; CHECK-NEXT: uaddl2 v2.4s, v6.8h, v2.8h
-; CHECK-NEXT: uaddl v0.2d, v5.2s, v7.2s
-; CHECK-NEXT: uaddl2 v1.2d, v5.4s, v7.4s
-; CHECK-NEXT: uaddl2 v3.2d, v4.4s, v2.4s
-; CHECK-NEXT: uaddl v2.2d, v4.2s, v2.2s
+; CHECK-NEXT: movi v4.2d, #0x00ffff0000ffff
+; CHECK-NEXT: uzp1 v5.4s, v0.4s, v1.4s
+; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: and v3.16b, v5.16b, v4.16b
+; CHECK-NEXT: and v6.16b, v0.16b, v4.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v4.16b, v2.16b, v4.16b
+; CHECK-NEXT: usra v3.4s, v5.4s, #16
+; CHECK-NEXT: usra v6.4s, v0.4s, #16
+; CHECK-NEXT: usra v7.4s, v1.4s, #16
+; CHECK-NEXT: usra v4.4s, v2.4s, #16
+; CHECK-NEXT: uaddl v0.2d, v3.2s, v6.2s
+; CHECK-NEXT: uaddl2 v1.2d, v3.4s, v6.4s
+; CHECK-NEXT: uaddl2 v3.2d, v7.4s, v4.4s
+; CHECK-NEXT: uaddl v2.2d, v7.2s, v4.2s
; CHECK-NEXT: ret
%s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%z1 = zext <8 x i16> %s1 to <8 x i64>
@@ -392,86 +329,77 @@ define <8 x i64> @zext_load_add(ptr %p) {
define <8 x double> @uitofp_fadd(<32 x i16> %l) {
; CHECK-LABEL: uitofp_fadd:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI24_0
-; CHECK-NEXT: adrp x9, .LCPI24_1
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: adrp x10, .LCPI24_6
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_0]
-; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI24_1]
-; CHECK-NEXT: adrp x8, .LCPI24_2
-; CHECK-NEXT: adrp x9, .LCPI24_3
-; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI24_2]
-; CHECK-NEXT: adrp x8, .LCPI24_4
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v4.16b, { v0.16b, v1.16b }, v4.16b
-; CHECK-NEXT: tbl v5.16b, { v2.16b, v3.16b }, v5.16b
-; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI24_3]
-; CHECK-NEXT: adrp x9, .LCPI24_5
-; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI24_4]
-; CHECK-NEXT: adrp x8, .LCPI24_7
-; CHECK-NEXT: ldr q17, [x9, :lo12:.LCPI24_5]
-; CHECK-NEXT: ldr q18, [x10, :lo12:.LCPI24_6]
-; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI24_7]
-; CHECK-NEXT: tbl v6.16b, { v0.16b, v1.16b }, v6.16b
-; CHECK-NEXT: tbl v7.16b, { v2.16b, v3.16b }, v7.16b
-; CHECK-NEXT: tbl v16.16b, { v0.16b, v1.16b }, v16.16b
-; CHECK-NEXT: tbl v17.16b, { v2.16b, v3.16b }, v17.16b
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b
-; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v19.16b
-; CHECK-NEXT: ushll2 v5.4s, v5.8h, #0
-; CHECK-NEXT: ushll v4.4s, v4.4h, #0
-; CHECK-NEXT: ushll2 v7.4s, v7.8h, #0
-; CHECK-NEXT: ushll v6.4s, v6.4h, #0
-; CHECK-NEXT: ushll v16.4s, v16.4h, #0
-; CHECK-NEXT: ushll2 v20.2d, v5.4s, #0
-; CHECK-NEXT: ushll2 v21.2d, v4.4s, #0
-; CHECK-NEXT: ushll2 v17.4s, v17.8h, #0
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: ushll v2.2d, v5.2s, #0
-; CHECK-NEXT: ushll v3.2d, v4.2s, #0
-; CHECK-NEXT: ushll2 v4.2d, v7.4s, #0
-; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0
-; CHECK-NEXT: ushll v7.2d, v7.2s, #0
-; CHECK-NEXT: ucvtf v18.2d, v20.2d
-; CHECK-NEXT: ucvtf v19.2d, v21.2d
+; CHECK-NEXT: uzp1 v5.4s, v0.4s, v3.4s
+; CHECK-NEXT: uzp1 v6.4s, v0.4s, v1.4s
+; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: movi d4, #0x00ffff0000ffff
+; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s
+; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8
+; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #8
+; CHECK-NEXT: uzp2 v1.4s, v0.4s, v3.4s
+; CHECK-NEXT: and v17.8b, v6.8b, v4.8b
+; CHECK-NEXT: and v18.8b, v7.8b, v4.8b
+; CHECK-NEXT: ushr v6.2s, v6.2s, #16
+; CHECK-NEXT: ushr v7.2s, v7.2s, #16
+; CHECK-NEXT: and v21.8b, v0.8b, v4.8b
+; CHECK-NEXT: and v22.8b, v2.8b, v4.8b
+; CHECK-NEXT: ushr v2.2s, v2.2s, #16
+; CHECK-NEXT: and v19.8b, v16.8b, v4.8b
+; CHECK-NEXT: and v20.8b, v5.8b, v4.8b
+; CHECK-NEXT: ushll v3.2d, v17.2s, #0
+; CHECK-NEXT: ushll v17.2d, v18.2s, #0
+; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: ushr v16.2s, v16.2s, #16
+; CHECK-NEXT: ushr v5.2s, v5.2s, #16
; CHECK-NEXT: ushll v6.2d, v6.2s, #0
-; CHECK-NEXT: ushll2 v20.2d, v17.4s, #0
-; CHECK-NEXT: ushll2 v21.2d, v16.4s, #0
-; CHECK-NEXT: ushll v17.2d, v17.2s, #0
+; CHECK-NEXT: ushll v7.2d, v7.2s, #0
+; CHECK-NEXT: ushll v18.2d, v19.2s, #0
+; CHECK-NEXT: ushll v19.2d, v20.2s, #0
+; CHECK-NEXT: ext v20.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ushr v0.2s, v0.2s, #16
; CHECK-NEXT: ushll v16.2d, v16.2s, #0
-; CHECK-NEXT: ushll v22.2d, v0.2s, #0
-; CHECK-NEXT: ushll2 v23.2d, v1.4s, #0
-; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-NEXT: ushll v21.2d, v21.2s, #0
+; CHECK-NEXT: ushll v5.2d, v5.2s, #0
+; CHECK-NEXT: ushll v22.2d, v22.2s, #0
+; CHECK-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-NEXT: ucvtf v7.2d, v7.2d
-; CHECK-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-NEXT: ucvtf v20.2d, v20.2d
-; CHECK-NEXT: ucvtf v21.2d, v21.2d
; CHECK-NEXT: ucvtf v17.2d, v17.2d
+; CHECK-NEXT: ucvtf v6.2d, v6.2d
+; CHECK-NEXT: and v23.8b, v20.8b, v4.8b
+; CHECK-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEXT: ushr v20.2s, v20.2s, #16
+; CHECK-NEXT: ushr v1.2s, v1.2s, #16
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-NEXT: ucvtf v18.2d, v18.2d
+; CHECK-NEXT: ucvtf v19.2d, v19.2d
; CHECK-NEXT: ucvtf v16.2d, v16.2d
+; CHECK-NEXT: ushll v23.2d, v23.2s, #0
+; CHECK-NEXT: ushll v4.2d, v4.2s, #0
+; CHECK-NEXT: ushll v20.2d, v20.2s, #0
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ucvtf v5.2d, v5.2d
+; CHECK-NEXT: ucvtf v21.2d, v21.2d
; CHECK-NEXT: ucvtf v22.2d, v22.2d
-; CHECK-NEXT: ucvtf v23.2d, v23.2d
; CHECK-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-NEXT: ucvtf v23.2d, v23.2d
+; CHECK-NEXT: ucvtf v4.2d, v4.2d
+; CHECK-NEXT: ucvtf v20.2d, v20.2d
; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: fadd v4.2d, v18.2d, v4.2d
-; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d
-; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d
+; CHECK-NEXT: fadd v16.2d, v18.2d, v16.2d
+; CHECK-NEXT: fadd v7.2d, v17.2d, v7.2d
; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d
-; CHECK-NEXT: fadd v6.2d, v16.2d, v22.2d
-; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d
-; CHECK-NEXT: fadd v7.2d, v17.2d, v1.2d
-; CHECK-NEXT: fadd v1.2d, v21.2d, v0.2d
-; CHECK-NEXT: fadd v0.2d, v3.2d, v6.2d
-; CHECK-NEXT: fadd v3.2d, v4.2d, v16.2d
-; CHECK-NEXT: fadd v1.2d, v5.2d, v1.2d
-; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d
+; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d
+; CHECK-NEXT: fadd v0.2d, v21.2d, v0.2d
+; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d
+; CHECK-NEXT: fadd v4.2d, v4.2d, v1.2d
+; CHECK-NEXT: fadd v1.2d, v23.2d, v20.2d
+; CHECK-NEXT: fadd v0.2d, v3.2d, v0.2d
+; CHECK-NEXT: fadd v2.2d, v7.2d, v2.2d
+; CHECK-NEXT: fadd v1.2d, v16.2d, v1.2d
+; CHECK-NEXT: fadd v3.2d, v5.2d, v4.2d
; CHECK-NEXT: ret
%s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%z1 = uitofp <8 x i16> %s1 to <8 x double>
More information about the llvm-commits
mailing list