[llvm] [AArch64] Fold away zext of extract of uzp. (PR #107367)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 5 02:06:41 PDT 2024
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/107367
Similar to #107201, this comes up from the lowering of zext of deinterleaving shuffles. Patterns such as zext(extract_subvector(uzp(a, b))) can be converted to a simple and that performs the extract/zext when the uzp is a uzp1. A uzp2 can be handled with an extra shift, and due to the existing legalization there may already be an and / shift in between, which can be combined in as well.
Mostly this reduces instruction count or increases the amount of parallelism in the sequence.
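For example, in the updated v4i32_1357 test below, zero-extending the odd i16
lanes of a v8i16 to v4i32 previously went through a uzp2 and a ushll, and now
becomes a single shift:

  before:   uzp2 v0.8h, v0.8h, v0.8h
            ushll v0.4s, v0.4h, #0
  after:    ushr v0.4s, v0.4s, #16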
From 05c2af8fd9345a0c5e6d604e8526c4c59dfb222b Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 5 Sep 2024 10:03:33 +0100
Subject: [PATCH] [AArch64] Fold away zext of extract of uzp.
Similar to #107201, this comes up from the lowering of zext of deinterleaving
shuffles. Patterns such as zext(extract_subvector(uzp(a, b))) can be converted
to a simple and that performs the extract/zext when the uzp is a uzp1. A uzp2
can be handled with an extra shift, and due to the existing legalization there
may already be an and / shift in between, which can be combined in as well.
Mostly this reduces instruction count or increases the amount of parallelism in
the sequence.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 67 ++++++++
llvm/test/CodeGen/AArch64/zext-shuffle.ll | 162 ++++++++----------
2 files changed, 136 insertions(+), 93 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d1ddbfa300846b..4d25260506eb1c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22240,6 +22240,71 @@ static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
}
+// This comes up similarly to the above when lowering deinterleaving shuffles
+// from zexts. We have legalized the operations in the general case to
+// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
+// the extract is of the low half and the uzp is uzp1. An extra shift is needed
+// if the uzp is a uzp2, to grab the upper half. Due to the combine above
+// there could also be an existing and / shift that can be combined in, either
+// before or after the extract.
+static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (N->getOpcode() != ISD::ZERO_EXTEND ||
+ (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ unsigned ExtOffset = (unsigned)-1;
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ ExtOffset = Op.getConstantOperandVal(1);
+ Op = Op.getOperand(0);
+ }
+
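+ // An existing shift or mask on the input (produced by the combine above or
+ // by legalization) is folded into the shift and mask generated below.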
+ unsigned Shift = 0;
+ APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
+ Op.getValueType().getScalarSizeInBits());
+
+ if (Op.getOpcode() == AArch64ISD::VLSHR) {
+ Shift = Op.getConstantOperandVal(1);
+ Op = Op.getOperand(0);
+ Mask = Mask.lshr(Shift);
+ }
+ if (Op.getOpcode() == ISD::AND &&
+ ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
+ Op = Op.getOperand(0);
+ Mask = Mask.zext(VT.getScalarSizeInBits());
+ } else if (Op.getOpcode() == AArch64ISD::BICi) {
+ Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
+ Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
+ Mask = Mask.zext(VT.getScalarSizeInBits());
+ Op = Op.getOperand(0);
+ }
+
+ if (ExtOffset == (unsigned)-1) {
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ ExtOffset = Op.getConstantOperandVal(1);
+ Op = Op.getOperand(0);
+ } else
+ return SDValue();
+ }
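+ // The extract must take either the low or the high half of the uzp result.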
+ if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
+ return SDValue();
+
+ if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
+ return SDValue();
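+ // uzp2 takes the odd lanes, which sit in the high half of each wider
+ // element, so they need an extra shift of half the element width.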
+ if (Op.getOpcode() == AArch64ISD::UZP2)
+ Shift += VT.getScalarSizeInBits() / 2;
+
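+ // Perform the shift and mask directly on the uzp operand that the extract
+ // would have selected, nvcast to the result type.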
+ SDLoc DL(N);
+ SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+ Op.getOperand(ExtOffset == 0 ? 0 : 1));
+ if (Shift != 0)
+ BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
+ DAG.getConstant(Shift, DL, MVT::i32));
+ return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
+}
+
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -22262,6 +22327,8 @@ static SDValue performExtendCombine(SDNode *N,
if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
return R;
+ if (SDValue R = performZExtUZPCombine(N, DAG))
+ return R;
if (N->getValueType(0).isFixedLengthVector() &&
N->getOpcode() == ISD::SIGN_EXTEND &&
diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
index 6415fba29ff79b..6d25c874a28933 100644
--- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
@@ -90,10 +90,11 @@ define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) {
define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) {
; CHECK-LABEL: v2i64_i16_15913:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ushr v0.4s, v0.4s, #16
-; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: movi v2.2d, #0x0000000000ffff
+; CHECK-NEXT: ushr v0.2d, v0.2d, #16
+; CHECK-NEXT: ushr v1.2d, v1.2d, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
%z1 = zext <4 x i16> %s1 to <4 x i64>
@@ -117,10 +118,8 @@ define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) {
define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) {
; CHECK-LABEL: v2i64_i16_371115:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ushr v0.4s, v0.4s, #16
-; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: ushr v0.2d, v0.2d, #48
+; CHECK-NEXT: ushr v1.2d, v1.2d, #48
; CHECK-NEXT: ret
%s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
%z1 = zext <4 x i16> %s1 to <4 x i64>
@@ -142,8 +141,7 @@ define <4 x i32> @v4i32_0246(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: v4i32_1357:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ushr v0.4s, v0.4s, #16
; CHECK-NEXT: ret
%c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%d = zext <4 x i16> %c to <4 x i32>
@@ -210,8 +208,7 @@ define <8 x i16> @v8i16_0246(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i16_1357:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushr v0.8h, v0.8h, #8
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%d = zext <8 x i8> %c to <8 x i16>
@@ -278,8 +275,7 @@ define <8 x i32> @v8i32_0246(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_1357:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushr v0.8h, v0.8h, #8
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ret
@@ -291,10 +287,9 @@ define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_04812:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%d = zext <8 x i8> %c to <8 x i32>
@@ -304,10 +299,11 @@ define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_15913:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ushr v0.8h, v0.8h, #8
-; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff
+; CHECK-NEXT: ushr v0.4s, v0.4s, #8
+; CHECK-NEXT: ushr v1.4s, v1.4s, #8
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
%d = zext <8 x i8> %c to <8 x i32>
@@ -317,10 +313,10 @@ define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_261014:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ushr v0.4s, v0.4s, #16
+; CHECK-NEXT: ushr v1.4s, v1.4s, #16
+; CHECK-NEXT: bic v0.4s, #255, lsl #8
+; CHECK-NEXT: bic v1.4s, #255, lsl #8
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
%d = zext <8 x i8> %c to <8 x i32>
@@ -330,10 +326,8 @@ define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_371115(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_371115:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ushr v0.8h, v0.8h, #8
-; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ushr v0.4s, v0.4s, #24
+; CHECK-NEXT: ushr v1.4s, v1.4s, #24
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
%d = zext <8 x i8> %c to <8 x i32>
@@ -407,77 +401,59 @@ define <8 x i64> @zext_load_add(ptr %p) {
define <8 x double> @uitofp_fadd(<32 x i16> %l) {
; CHECK-LABEL: uitofp_fadd:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp1 v5.4s, v0.4s, v3.4s
-; CHECK-NEXT: uzp1 v6.4s, v0.4s, v1.4s
-; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi d4, #0x00ffff0000ffff
-; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8
-; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #8
-; CHECK-NEXT: uzp2 v1.4s, v0.4s, v3.4s
-; CHECK-NEXT: and v17.8b, v6.8b, v4.8b
-; CHECK-NEXT: and v18.8b, v7.8b, v4.8b
-; CHECK-NEXT: ushr v6.2s, v6.2s, #16
-; CHECK-NEXT: ushr v7.2s, v7.2s, #16
-; CHECK-NEXT: and v21.8b, v0.8b, v4.8b
-; CHECK-NEXT: and v22.8b, v2.8b, v4.8b
-; CHECK-NEXT: ushr v2.2s, v2.2s, #16
-; CHECK-NEXT: and v19.8b, v16.8b, v4.8b
-; CHECK-NEXT: and v20.8b, v5.8b, v4.8b
-; CHECK-NEXT: ushll v3.2d, v17.2s, #0
-; CHECK-NEXT: ushll v17.2d, v18.2s, #0
-; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ushr v16.2s, v16.2s, #16
-; CHECK-NEXT: ushr v5.2s, v5.2s, #16
-; CHECK-NEXT: ushll v6.2d, v6.2s, #0
-; CHECK-NEXT: ushll v7.2d, v7.2s, #0
-; CHECK-NEXT: ushll v18.2d, v19.2s, #0
-; CHECK-NEXT: ushll v19.2d, v20.2s, #0
-; CHECK-NEXT: ext v20.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ushr v0.2s, v0.2s, #16
-; CHECK-NEXT: ushll v16.2d, v16.2s, #0
-; CHECK-NEXT: ushll v21.2d, v21.2s, #0
-; CHECK-NEXT: ushll v5.2d, v5.2s, #0
-; CHECK-NEXT: ushll v22.2d, v22.2s, #0
-; CHECK-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-NEXT: ucvtf v17.2d, v17.2d
-; CHECK-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-NEXT: and v23.8b, v20.8b, v4.8b
-; CHECK-NEXT: and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT: ushr v20.2s, v20.2s, #16
-; CHECK-NEXT: ushr v1.2s, v1.2s, #16
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-NEXT: movi v4.2d, #0x0000000000ffff
+; CHECK-NEXT: ushr v5.2d, v0.2d, #16
+; CHECK-NEXT: ushr v6.2d, v1.2d, #16
+; CHECK-NEXT: ushr v7.2d, v2.2d, #16
+; CHECK-NEXT: ushr v17.2d, v3.2d, #16
+; CHECK-NEXT: ushr v20.2d, v0.2d, #32
+; CHECK-NEXT: ushr v22.2d, v1.2d, #32
+; CHECK-NEXT: ushr v23.2d, v2.2d, #32
+; CHECK-NEXT: ushr v24.2d, v3.2d, #32
+; CHECK-NEXT: and v16.16b, v0.16b, v4.16b
+; CHECK-NEXT: and v18.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v19.16b, v2.16b, v4.16b
+; CHECK-NEXT: and v21.16b, v3.16b, v4.16b
+; CHECK-NEXT: and v5.16b, v5.16b, v4.16b
+; CHECK-NEXT: and v6.16b, v6.16b, v4.16b
+; CHECK-NEXT: and v7.16b, v7.16b, v4.16b
+; CHECK-NEXT: and v17.16b, v17.16b, v4.16b
+; CHECK-NEXT: and v20.16b, v20.16b, v4.16b
+; CHECK-NEXT: and v22.16b, v22.16b, v4.16b
+; CHECK-NEXT: and v23.16b, v23.16b, v4.16b
+; CHECK-NEXT: and v4.16b, v24.16b, v4.16b
+; CHECK-NEXT: ushr v0.2d, v0.2d, #48
+; CHECK-NEXT: ushr v1.2d, v1.2d, #48
+; CHECK-NEXT: ushr v2.2d, v2.2d, #48
+; CHECK-NEXT: ushr v3.2d, v3.2d, #48
+; CHECK-NEXT: ucvtf v16.2d, v16.2d
; CHECK-NEXT: ucvtf v18.2d, v18.2d
; CHECK-NEXT: ucvtf v19.2d, v19.2d
-; CHECK-NEXT: ucvtf v16.2d, v16.2d
-; CHECK-NEXT: ushll v23.2d, v23.2s, #0
-; CHECK-NEXT: ushll v4.2d, v4.2s, #0
-; CHECK-NEXT: ushll v20.2d, v20.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: ucvtf v5.2d, v5.2d
; CHECK-NEXT: ucvtf v21.2d, v21.2d
+; CHECK-NEXT: ucvtf v5.2d, v5.2d
+; CHECK-NEXT: ucvtf v6.2d, v6.2d
+; CHECK-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-NEXT: ucvtf v17.2d, v17.2d
+; CHECK-NEXT: ucvtf v20.2d, v20.2d
; CHECK-NEXT: ucvtf v22.2d, v22.2d
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
; CHECK-NEXT: ucvtf v23.2d, v23.2d
; CHECK-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-NEXT: ucvtf v20.2d, v20.2d
+; CHECK-NEXT: ucvtf v0.2d, v0.2d
; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: fadd v16.2d, v18.2d, v16.2d
-; CHECK-NEXT: fadd v7.2d, v17.2d, v7.2d
-; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d
-; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d
-; CHECK-NEXT: fadd v0.2d, v21.2d, v0.2d
-; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d
-; CHECK-NEXT: fadd v4.2d, v4.2d, v1.2d
-; CHECK-NEXT: fadd v1.2d, v23.2d, v20.2d
-; CHECK-NEXT: fadd v0.2d, v3.2d, v0.2d
+; CHECK-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-NEXT: fadd v5.2d, v16.2d, v5.2d
+; CHECK-NEXT: fadd v17.2d, v21.2d, v17.2d
+; CHECK-NEXT: fadd v7.2d, v19.2d, v7.2d
+; CHECK-NEXT: fadd v6.2d, v18.2d, v6.2d
+; CHECK-NEXT: fadd v0.2d, v20.2d, v0.2d
+; CHECK-NEXT: fadd v1.2d, v22.2d, v1.2d
+; CHECK-NEXT: fadd v3.2d, v4.2d, v3.2d
+; CHECK-NEXT: fadd v2.2d, v23.2d, v2.2d
+; CHECK-NEXT: fadd v0.2d, v5.2d, v0.2d
+; CHECK-NEXT: fadd v1.2d, v6.2d, v1.2d
; CHECK-NEXT: fadd v2.2d, v7.2d, v2.2d
-; CHECK-NEXT: fadd v1.2d, v16.2d, v1.2d
-; CHECK-NEXT: fadd v3.2d, v5.2d, v4.2d
+; CHECK-NEXT: fadd v3.2d, v17.2d, v3.2d
; CHECK-NEXT: ret
%s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%z1 = uitofp <8 x i16> %s1 to <8 x double>