[llvm] [AArch64] Improve lowering of truncating uzp1 (PR #82457)
Usman Nadeem via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 27 20:29:13 PST 2024
https://github.com/UsmanNadeem updated https://github.com/llvm/llvm-project/pull/82457
>From 93aa87491ebc5402278699379dede34916ee9f37 Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Tue, 20 Feb 2024 19:30:17 -0800
Subject: [PATCH 1/3] [AArch64] Improve lowering of truncating uzp1
This is related to/is a subset of PR 81960.
Instead of creating new build_vector nodes that remove the assert
sext/zext nodes as in PR 81960, I am handling it by adding to an
existing uzp1 combine to look through the assert ext nodes. This
should help preserve the extension information until very late in
the process.
Change-Id: Idb9f90cd9c4d8a9537509f2be619e17be564eef3
---
.../Target/AArch64/AArch64ISelLowering.cpp | 54 +++-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 9 +
.../CodeGen/AArch64/arm64-convert-v4f64.ll | 21 +-
.../CodeGen/AArch64/fp-conversion-to-tbl.ll | 5 +-
llvm/test/CodeGen/AArch64/fptoi.ll | 256 ++++++------------
llvm/test/CodeGen/AArch64/vcvt-oversize.ll | 5 +-
6 files changed, 145 insertions(+), 205 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8c5a4cdae11634..eb078ae499872e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21034,12 +21034,8 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
- // Only implemented on little-endian subtargets.
- bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
-
- // This optimization only works on little endian.
- if (!IsLittleEndian)
+ // These optimizations only works on little endian.
+ if (!DAG.getDataLayout().isLittleEndian())
return SDValue();
// uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
@@ -21059,20 +21055,50 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
auto getSourceOp = [](SDValue Operand) -> SDValue {
- const unsigned Opcode = Operand.getOpcode();
- if (Opcode == ISD::TRUNCATE)
- return Operand->getOperand(0);
- if (Opcode == ISD::BITCAST &&
- Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
- return Operand->getOperand(0)->getOperand(0);
- return SDValue();
+ if (Operand.getOpcode() == ISD::BITCAST)
+ Operand = Operand->getOperand(0);
+ if (Operand.getOpcode() == ISD::AssertSext ||
+ Operand.getOpcode() == ISD::AssertZext)
+ Operand = Operand->getOperand(0);
+ return Operand;
};
SDValue SourceOp0 = getSourceOp(Op0);
SDValue SourceOp1 = getSourceOp(Op1);
- if (!SourceOp0 || !SourceOp1)
+ auto IsTruncatingUZP1Concat = [](SDNode *N, LLVMContext &Ctx) -> bool {
+ if (N->getOpcode() != AArch64ISD::UZP1)
+ return false;
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ if (Op0.getOpcode() != ISD::BITCAST || Op1.getOpcode() != ISD::BITCAST)
+ return false;
+
+ EVT ResVT = N->getValueType(0);
+ return ResVT.widenIntegerVectorElementType(Ctx).getHalfNumVectorElementsVT(
+ Ctx) == Op0.getOperand(0).getValueType();
+ };
+
+ // truncating uzp1(x=uzp1, y=uzp1) -> trunc(concat (x, y))
+ // This is similar to the transform below, except that it looks for truncation
+ // done using the uzp1 node.
+ LLVMContext &Ctx = *DAG.getContext();
+ if (IsTruncatingUZP1Concat(N, Ctx) &&
+ IsTruncatingUZP1Concat(SourceOp0.getNode(), Ctx) &&
+ IsTruncatingUZP1Concat(SourceOp1.getNode(), Ctx)) {
+ SDValue UZP1 =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL,
+ SourceOp0.getValueType().getDoubleNumVectorElementsVT(Ctx),
+ SourceOp0, SourceOp1);
+ return DAG.getNode(ISD::TRUNCATE, DL, ResVT, UZP1);
+ }
+
+ // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
+ if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
+ SourceOp1.getOpcode() != ISD::TRUNCATE)
return SDValue();
+ SourceOp0 = SourceOp0.getOperand(0);
+ SourceOp1 = SourceOp1.getOperand(0);
if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
!SourceOp0.getValueType().isSimple())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 8c2a852850320f..270b7ba9ccbead 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6125,6 +6125,15 @@ def : Pat<(v8i16 (concat_vectors (v4i16 (assertzext (trunc (v4i32 V128:$Vn)))),
def : Pat<(v4i32 (concat_vectors (v2i32 (assertzext (trunc (v2i64 V128:$Vn)))),
(v2i32 (assertzext (trunc (v2i64 V128:$Vm)))))),
(UZP1v4i32 V128:$Vn, V128:$Vm)>;
+def : Pat<(v16i8 (concat_vectors (v8i8 (assertsext (trunc (v8i16 V128:$Vn)))),
+ (v8i8 (assertsext (trunc (v8i16 V128:$Vm)))))),
+ (UZP1v16i8 V128:$Vn, V128:$Vm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 (assertsext (trunc (v4i32 V128:$Vn)))),
+ (v4i16 (assertsext (trunc (v4i32 V128:$Vm)))))),
+ (UZP1v8i16 V128:$Vn, V128:$Vm)>;
+def : Pat<(v4i32 (concat_vectors (v2i32 (assertsext (trunc (v2i64 V128:$Vn)))),
+ (v2i32 (assertsext (trunc (v2i64 V128:$Vm)))))),
+ (UZP1v4i32 V128:$Vn, V128:$Vm)>;
def : Pat<(v16i8 (concat_vectors
(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 9bf638f57a5120..193e3b0cfbc7bc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -8,9 +8,8 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%tmp1 = load <4 x double>, ptr %ptr
%tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
@@ -26,13 +25,10 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v3.2s, v3.2d
-; CHECK-NEXT: xtn v2.2s, v2.2d
-; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: uzp1 v1.4h, v2.4h, v3.4h
-; CHECK-NEXT: uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%tmp1 = load <8 x double>, ptr %ptr
%tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
@@ -72,9 +68,8 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) {
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%tmp1 = load <4 x double>, ptr %ptr
%tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 1ea87bb6b04b51..0a3b9a070c2b32 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -73,9 +73,8 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs.4s v1, v1
; CHECK-NEXT: fcvtzs.4s v0, v0
-; CHECK-NEXT: xtn.4h v1, v1
-; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: uzp1.8b v0, v0, v1
+; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: xtn.8b v0, v0
; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 251719c1e3b430..3feb65e4eee66d 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1096,30 +1096,17 @@ entry:
}
define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) {
-; CHECK-SD-LABEL: fptos_v3f64_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptos_v3f64_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptos_v3f64_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptosi <3 x double> %a to <3 x i16>
ret <3 x i16> %c
@@ -1134,9 +1121,8 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) {
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v3f64_v3i16:
@@ -1160,9 +1146,8 @@ define <4 x i16> @fptos_v4f64_v4i16(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v4f64_v4i16:
@@ -1182,9 +1167,8 @@ define <4 x i16> @fptou_v4f64_v4i16(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v4f64_v4i16:
@@ -1600,9 +1584,8 @@ define <3 x i8> @fptos_v3f64_v3i8(<3 x double> %a) {
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: umov w0, v0.h[0]
; CHECK-SD-NEXT: umov w1, v0.h[1]
; CHECK-SD-NEXT: umov w2, v0.h[2]
@@ -1638,9 +1621,8 @@ define <3 x i8> @fptou_v3f64_v3i8(<3 x double> %a) {
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: umov w0, v0.h[0]
; CHECK-SD-NEXT: umov w1, v0.h[1]
; CHECK-SD-NEXT: umov w2, v0.h[2]
@@ -1672,9 +1654,8 @@ define <4 x i8> @fptos_v4f64_v4i8(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v4f64_v4i8:
@@ -1694,9 +1675,8 @@ define <4 x i8> @fptou_v4f64_v4i8(<4 x double> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v4f64_v4i8:
@@ -1718,13 +1698,10 @@ define <8 x i8> @fptos_v8f64_v8i8(<8 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v8f64_v8i8:
@@ -1750,13 +1727,10 @@ define <8 x i8> @fptou_v8f64_v8i8(<8 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v8f64_v8i8:
@@ -1786,21 +1760,13 @@ define <16 x i8> @fptos_v16f64_v16i8(<16 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v7.2s, v7.2d
-; CHECK-SD-NEXT: xtn v6.2s, v6.2d
-; CHECK-SD-NEXT: xtn v5.2s, v5.2d
-; CHECK-SD-NEXT: xtn v4.2s, v4.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v16f64_v16i8:
@@ -1837,21 +1803,13 @@ define <16 x i8> @fptou_v16f64_v16i8(<16 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: xtn v7.2s, v7.2d
-; CHECK-SD-NEXT: xtn v6.2s, v6.2d
-; CHECK-SD-NEXT: xtn v5.2s, v5.2d
-; CHECK-SD-NEXT: xtn v4.2s, v4.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v16f64_v16i8:
@@ -1900,36 +1858,20 @@ define <32 x i8> @fptos_v32f64_v32i8(<32 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v18.2d, v18.2d
; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT: xtn v7.2s, v7.2d
-; CHECK-SD-NEXT: xtn v6.2s, v6.2d
-; CHECK-SD-NEXT: xtn v5.2s, v5.2d
-; CHECK-SD-NEXT: xtn v4.2s, v4.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: xtn v23.2s, v23.2d
-; CHECK-SD-NEXT: xtn v22.2s, v22.2d
-; CHECK-SD-NEXT: xtn v21.2s, v21.2d
-; CHECK-SD-NEXT: xtn v20.2s, v20.2d
-; CHECK-SD-NEXT: xtn v19.2s, v19.2d
-; CHECK-SD-NEXT: xtn v18.2s, v18.2d
-; CHECK-SD-NEXT: xtn v17.2s, v17.2d
-; CHECK-SD-NEXT: xtn v16.2s, v16.2d
-; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT: uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT: uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT: uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT: mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT: mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT: uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT: uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT: uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT: uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT: uzp1 v2.8h, v7.8h, v5.8h
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT: uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v32f64_v32i8:
@@ -1997,36 +1939,20 @@ define <32 x i8> @fptou_v32f64_v32i8(<32 x double> %a) {
; CHECK-SD-NEXT: fcvtzs v18.2d, v18.2d
; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT: xtn v7.2s, v7.2d
-; CHECK-SD-NEXT: xtn v6.2s, v6.2d
-; CHECK-SD-NEXT: xtn v5.2s, v5.2d
-; CHECK-SD-NEXT: xtn v4.2s, v4.2d
-; CHECK-SD-NEXT: xtn v3.2s, v3.2d
-; CHECK-SD-NEXT: xtn v2.2s, v2.2d
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: xtn v0.2s, v0.2d
-; CHECK-SD-NEXT: xtn v23.2s, v23.2d
-; CHECK-SD-NEXT: xtn v22.2s, v22.2d
-; CHECK-SD-NEXT: xtn v21.2s, v21.2d
-; CHECK-SD-NEXT: xtn v20.2s, v20.2d
-; CHECK-SD-NEXT: xtn v19.2s, v19.2d
-; CHECK-SD-NEXT: xtn v18.2s, v18.2d
-; CHECK-SD-NEXT: xtn v17.2s, v17.2d
-; CHECK-SD-NEXT: xtn v16.2s, v16.2d
-; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT: uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT: uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT: uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT: mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT: mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT: mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT: uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT: uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT: uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT: uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT: uzp1 v2.8h, v7.8h, v5.8h
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT: uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v32f64_v32i8:
@@ -3028,9 +2954,8 @@ define <8 x i8> @fptos_v8f32_v8i8(<8 x float> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v1.4h, v1.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v8f32_v8i8:
@@ -3050,9 +2975,8 @@ define <8 x i8> @fptou_v8f32_v8i8(<8 x float> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v1.4h, v1.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v8f32_v8i8:
@@ -3074,12 +2998,8 @@ define <16 x i8> @fptos_v16f32_v16i8(<16 x float> %a) {
; CHECK-SD-NEXT: fcvtzs v2.4s, v2.4s
; CHECK-SD-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v3.4h, v3.4s
-; CHECK-SD-NEXT: xtn v2.4h, v2.4s
-; CHECK-SD-NEXT: xtn v1.4h, v1.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
@@ -3136,20 +3056,12 @@ define <32 x i8> @fptos_v32f32_v32i8(<32 x float> %a) {
; CHECK-SD-NEXT: fcvtzs v6.4s, v6.4s
; CHECK-SD-NEXT: fcvtzs v5.4s, v5.4s
; CHECK-SD-NEXT: fcvtzs v4.4s, v4.4s
-; CHECK-SD-NEXT: xtn v3.4h, v3.4s
-; CHECK-SD-NEXT: xtn v2.4h, v2.4s
-; CHECK-SD-NEXT: xtn v1.4h, v1.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: xtn v7.4h, v7.4s
-; CHECK-SD-NEXT: xtn v6.4h, v6.4s
-; CHECK-SD-NEXT: xtn v5.4h, v5.4s
-; CHECK-SD-NEXT: xtn v4.4h, v4.4s
-; CHECK-SD-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT: mov v4.d[1], v5.d[0]
+; CHECK-SD-NEXT: uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: uzp1 v1.8h, v6.8h, v7.8h
+; CHECK-SD-NEXT: uzp1 v3.8h, v4.8h, v5.8h
; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT: uzp1 v1.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: uzp1 v1.16b, v3.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v32f32_v32i8:
diff --git a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
index 380bdbcc7f7408..611940546bc1a2 100644
--- a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
+++ b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -9,9 +9,8 @@ define <8 x i8> @float_to_i8(ptr %in) {
; CHECK-NEXT: fadd v0.4s, v0.4s, v0.4s
; CHECK-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-NEXT: fcvtzs v1.4s, v1.4s
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%l = load <8 x float>, ptr %in
%scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>
>From 0a08e0b10a01f69906669991eadf6d5f6dd26fab Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Wed, 21 Feb 2024 10:01:27 -0800
Subject: [PATCH 2/3] Check bitcast source type
Change-Id: Ib1018cdc616550d41a6cdb9bdde07654f0927f33
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eb078ae499872e..ba56b0b308e198 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21073,10 +21073,13 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = N->getOperand(1);
if (Op0.getOpcode() != ISD::BITCAST || Op1.getOpcode() != ISD::BITCAST)
return false;
+ EVT Op0Ty = Op0.getOperand(0).getValueType();
+ if (Op0Ty != Op1.getOperand(0).getValueType())
+ return false;
EVT ResVT = N->getValueType(0);
return ResVT.widenIntegerVectorElementType(Ctx).getHalfNumVectorElementsVT(
- Ctx) == Op0.getOperand(0).getValueType();
+ Ctx) == Op0Ty;
};
// truncating uzp1(x=uzp1, y=uzp1) -> trunc(concat (x, y))
>From ffe78588f1919d63c6a5c7352ca737b64dc5816f Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Tue, 27 Feb 2024 20:24:17 -0800
Subject: [PATCH 3/3] simplify code, address comments
Change-Id: I75f977c199faf9d8ed669db24feedf338ec0a3d8
---
.../Target/AArch64/AArch64ISelLowering.cpp | 54 +++++-----------
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 62 ++++++++++---------
llvm/test/CodeGen/AArch64/extbinopload.ll | 31 +++++-----
llvm/test/CodeGen/AArch64/neon-truncstore.ll | 5 +-
llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 2 +-
llvm/test/CodeGen/AArch64/shuffle-tbl34.ll | 14 +++--
llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 2 +-
llvm/test/CodeGen/AArch64/tbl-loops.ll | 4 +-
llvm/test/CodeGen/AArch64/trunc-to-tbl.ll | 28 ++++-----
llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 2 +-
llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 2 +-
.../vec-combine-compare-truncate-store.ll | 2 +-
.../AArch64/vec3-loads-ext-trunc-stores.ll | 22 +++----
13 files changed, 106 insertions(+), 124 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ba56b0b308e198..b15d652ebfc78b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21034,7 +21034,7 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // These optimizations only works on little endian.
+ // These optimizations only work on little endian.
if (!DAG.getDataLayout().isLittleEndian())
return SDValue();
@@ -21054,46 +21054,20 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
return SDValue();
- auto getSourceOp = [](SDValue Operand) -> SDValue {
- if (Operand.getOpcode() == ISD::BITCAST)
- Operand = Operand->getOperand(0);
- if (Operand.getOpcode() == ISD::AssertSext ||
- Operand.getOpcode() == ISD::AssertZext)
- Operand = Operand->getOperand(0);
- return Operand;
- };
-
- SDValue SourceOp0 = getSourceOp(Op0);
- SDValue SourceOp1 = getSourceOp(Op1);
-
- auto IsTruncatingUZP1Concat = [](SDNode *N, LLVMContext &Ctx) -> bool {
- if (N->getOpcode() != AArch64ISD::UZP1)
- return false;
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- if (Op0.getOpcode() != ISD::BITCAST || Op1.getOpcode() != ISD::BITCAST)
- return false;
- EVT Op0Ty = Op0.getOperand(0).getValueType();
- if (Op0Ty != Op1.getOperand(0).getValueType())
- return false;
-
- EVT ResVT = N->getValueType(0);
- return ResVT.widenIntegerVectorElementType(Ctx).getHalfNumVectorElementsVT(
- Ctx) == Op0Ty;
- };
+ SDValue SourceOp0 = peekThroughBitcasts(Op0);
+ SDValue SourceOp1 = peekThroughBitcasts(Op1);
- // truncating uzp1(x=uzp1, y=uzp1) -> trunc(concat (x, y))
- // This is similar to the transform below, except that it looks for truncation
- // done using the uzp1 node.
- LLVMContext &Ctx = *DAG.getContext();
- if (IsTruncatingUZP1Concat(N, Ctx) &&
- IsTruncatingUZP1Concat(SourceOp0.getNode(), Ctx) &&
- IsTruncatingUZP1Concat(SourceOp1.getNode(), Ctx)) {
- SDValue UZP1 =
- DAG.getNode(ISD::CONCAT_VECTORS, DL,
- SourceOp0.getValueType().getDoubleNumVectorElementsVT(Ctx),
- SourceOp0, SourceOp1);
- return DAG.getNode(ISD::TRUNCATE, DL, ResVT, UZP1);
+ // truncating uzp1(x, y) -> xtn(concat (x, y))
+ if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
+ EVT Op0Ty = SourceOp0.getValueType();
+ if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
+ (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
+ SDValue Concat =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL,
+ Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
+ SourceOp0, SourceOp1);
+ return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
+ }
}
// uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 270b7ba9ccbead..cf7995eda540ba 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6105,35 +6105,39 @@ defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
-def : Pat<(v16i8 (concat_vectors (v8i8 (trunc (v8i16 V128:$Vn))),
- (v8i8 (trunc (v8i16 V128:$Vm))))),
- (UZP1v16i8 V128:$Vn, V128:$Vm)>;
-def : Pat<(v8i16 (concat_vectors (v4i16 (trunc (v4i32 V128:$Vn))),
- (v4i16 (trunc (v4i32 V128:$Vm))))),
- (UZP1v8i16 V128:$Vn, V128:$Vm)>;
-def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
- (v2i32 (trunc (v2i64 V128:$Vm))))),
- (UZP1v4i32 V128:$Vn, V128:$Vm)>;
-// These are the same as above, with an optional assertzext node that can be
-// generated from fptoi lowering.
-def : Pat<(v16i8 (concat_vectors (v8i8 (assertzext (trunc (v8i16 V128:$Vn)))),
- (v8i8 (assertzext (trunc (v8i16 V128:$Vm)))))),
- (UZP1v16i8 V128:$Vn, V128:$Vm)>;
-def : Pat<(v8i16 (concat_vectors (v4i16 (assertzext (trunc (v4i32 V128:$Vn)))),
- (v4i16 (assertzext (trunc (v4i32 V128:$Vm)))))),
- (UZP1v8i16 V128:$Vn, V128:$Vm)>;
-def : Pat<(v4i32 (concat_vectors (v2i32 (assertzext (trunc (v2i64 V128:$Vn)))),
- (v2i32 (assertzext (trunc (v2i64 V128:$Vm)))))),
- (UZP1v4i32 V128:$Vn, V128:$Vm)>;
-def : Pat<(v16i8 (concat_vectors (v8i8 (assertsext (trunc (v8i16 V128:$Vn)))),
- (v8i8 (assertsext (trunc (v8i16 V128:$Vm)))))),
- (UZP1v16i8 V128:$Vn, V128:$Vm)>;
-def : Pat<(v8i16 (concat_vectors (v4i16 (assertsext (trunc (v4i32 V128:$Vn)))),
- (v4i16 (assertsext (trunc (v4i32 V128:$Vm)))))),
- (UZP1v8i16 V128:$Vn, V128:$Vm)>;
-def : Pat<(v4i32 (concat_vectors (v2i32 (assertsext (trunc (v2i64 V128:$Vn)))),
- (v2i32 (assertsext (trunc (v2i64 V128:$Vm)))))),
- (UZP1v4i32 V128:$Vn, V128:$Vm)>;
+def trunc_optional_assert_ext : PatFrags<(ops node:$op0),
+ [(trunc node:$op0),
+ (assertzext (trunc node:$op0)),
+ (assertsext (trunc node:$op0))]>;
+
+// concat_vectors(trunc(x), trunc(y)) -> uzp1(x, y)
+// concat_vectors(assertzext(trunc(x)), assertzext(trunc(y))) -> uzp1(x, y)
+// concat_vectors(assertsext(trunc(x)), assertsext(trunc(y))) -> uzp1(x, y)
+class concat_trunc_to_uzp1_pat<ValueType SrcTy, ValueType TruncTy, ValueType ConcatTy>
+ : Pat<(ConcatTy (concat_vectors (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vn))),
+ (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vm))))),
+ (!cast<Instruction>("UZP1"#ConcatTy) V128:$Vn, V128:$Vm)>;
+def : concat_trunc_to_uzp1_pat<v8i16, v8i8, v16i8>;
+def : concat_trunc_to_uzp1_pat<v4i32, v4i16, v8i16>;
+def : concat_trunc_to_uzp1_pat<v2i64, v2i32, v4i32>;
+
+// trunc(concat_vectors(trunc(x), trunc(y))) -> xtn(uzp1(x, y))
+// trunc(concat_vectors(assertzext(trunc(x)), assertzext(trunc(y)))) -> xtn(uzp1(x, y))
+// trunc(concat_vectors(assertsext(trunc(x)), assertsext(trunc(y)))) -> xtn(uzp1(x, y))
+class trunc_concat_trunc_to_xtn_uzp1_pat<ValueType SrcTy, ValueType TruncTy, ValueType ConcatTy,
+ ValueType Ty>
+ : Pat<(Ty (trunc_optional_assert_ext
+ (ConcatTy (concat_vectors
+ (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vn))),
+ (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vm))))))),
+ (!cast<Instruction>("XTN"#Ty) (!cast<Instruction>("UZP1"#ConcatTy) V128:$Vn, V128:$Vm))>;
+def : trunc_concat_trunc_to_xtn_uzp1_pat<v4i32, v4i16, v8i16, v8i8>;
+def : trunc_concat_trunc_to_xtn_uzp1_pat<v2i64, v2i32, v4i32, v4i16>;
+
+def : Pat<(v8i8 (trunc (concat_vectors (v4i16 V64:$Vn), (v4i16 V64:$Vm)))),
+ (UZP1v8i8 V64:$Vn, V64:$Vm)>;
+def : Pat<(v4i16 (trunc (concat_vectors (v2i32 V64:$Vn), (v2i32 V64:$Vm)))),
+ (UZP1v4i16 V64:$Vn, V64:$Vm)>;
def : Pat<(v16i8 (concat_vectors
(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 1f68c77611e10d..dff4831330deb0 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -650,7 +650,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: add x11, x3, #12
; CHECK-NEXT: str s1, [x4]
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ldp s0, s5, [x2]
+; CHECK-NEXT: ldp s0, s4, [x2]
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-NEXT: umov w9, v2.h[0]
; CHECK-NEXT: umov w10, v2.h[1]
@@ -662,24 +662,25 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: mov v0.b[10], w9
; CHECK-NEXT: add x9, x1, #4
-; CHECK-NEXT: uzp1 v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: mov v1.d[1], v2.d[0]
; CHECK-NEXT: mov v0.b[11], w10
; CHECK-NEXT: add x10, x1, #12
+; CHECK-NEXT: bic v1.8h, #255, lsl #8
; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4
-; CHECK-NEXT: ldr s4, [x0, #12]
-; CHECK-NEXT: ldp s3, s16, [x0, #4]
-; CHECK-NEXT: ld1 { v5.s }[1], [x3]
-; CHECK-NEXT: ldp s6, s7, [x2, #8]
-; CHECK-NEXT: ld1 { v4.s }[1], [x10]
-; CHECK-NEXT: ld1 { v3.s }[1], [x9]
-; CHECK-NEXT: ld1 { v6.s }[1], [x8]
-; CHECK-NEXT: ld1 { v7.s }[1], [x11]
+; CHECK-NEXT: ldr s3, [x0, #12]
+; CHECK-NEXT: ldp s2, s7, [x0, #4]
+; CHECK-NEXT: ld1 { v4.s }[1], [x3]
+; CHECK-NEXT: ldp s5, s6, [x2, #8]
+; CHECK-NEXT: ld1 { v3.s }[1], [x10]
+; CHECK-NEXT: ld1 { v2.s }[1], [x9]
+; CHECK-NEXT: ld1 { v5.s }[1], [x8]
+; CHECK-NEXT: ld1 { v6.s }[1], [x11]
; CHECK-NEXT: add x8, x1, #8
-; CHECK-NEXT: ld1 { v16.s }[1], [x8]
-; CHECK-NEXT: uaddl v2.8h, v3.8b, v4.8b
-; CHECK-NEXT: ushll v3.8h, v6.8b, #0
-; CHECK-NEXT: uaddl v4.8h, v5.8b, v7.8b
-; CHECK-NEXT: uaddl v1.8h, v1.8b, v16.8b
+; CHECK-NEXT: ld1 { v7.s }[1], [x8]
+; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: ushll v3.8h, v5.8b, #0
+; CHECK-NEXT: uaddl v4.8h, v4.8b, v6.8b
+; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b
; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b
; CHECK-NEXT: ushll v0.4s, v2.4h, #3
; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
index b677d077b98c14..5d78ad24eb333d 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
@@ -104,7 +104,7 @@ define void @v4i32_v4i8(<4 x i32> %a, ptr %result) {
; CHECK-LABEL: v4i32_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
%b = trunc <4 x i32> %a to <4 x i8>
@@ -170,8 +170,7 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) {
define void @v4i16_v4i8(<4 x i16> %a, ptr %result) {
; CHECK-LABEL: v4i16_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
%b = trunc <4 x i16> %a to <4 x i8>
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 5f905d94e3573c..6f1ae023bf25a1 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -145,7 +145,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
index 0ef64789ad9724..fb571eff39fe50 100644
--- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
@@ -353,13 +353,17 @@ define <8 x i8> @shuffle4_v8i8_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x
define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; CHECK-LABEL: shuffle4_v4i8_zext:
; CHECK: // %bb.0:
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: uzp1 v1.8b, v2.8b, v3.8b
+; CHECK-NEXT: fmov d5, d2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: adrp x8, .LCPI8_0
-; CHECK-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-NEXT: fmov d4, d0
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-NEXT: mov v4.d[1], v1.d[0]
+; CHECK-NEXT: mov v5.d[1], v3.d[0]
+; CHECK-NEXT: bic v4.8h, #255, lsl #8
+; CHECK-NEXT: bic v5.8h, #255, lsl #8
+; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b }, v0.16b
; CHECK-NEXT: ret
%x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index acec3e74d3e93f..d1f843a09f749d 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -146,7 +146,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 4f8a4f7aede3eb..0ad99008655184 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -41,8 +41,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcvtzs v2.4s, v2.4s
; CHECK-NEXT: xtn v1.4h, v1.4s
; CHECK-NEXT: xtn v2.4h, v2.4s
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: xtn v2.8b, v2.8h
+; CHECK-NEXT: uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-NEXT: uzp1 v2.8b, v2.8b, v0.8b
; CHECK-NEXT: mov v1.s[1], v2.s[0]
; CHECK-NEXT: stur d1, [x12, #-4]
; CHECK-NEXT: add x12, x12, #8
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index ba367b0dbfde34..18cd4cc2111a42 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -710,23 +710,23 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: LBB6_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp q4, q1, [x0, #48]
-; CHECK-NEXT: add x9, x1, #8
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: subs x8, x8, #1
+; CHECK-NEXT: add x9, x1, #10
; CHECK-NEXT: ldr d0, [x0, #80]
+; CHECK-NEXT: ldp q3, q2, [x0]
; CHECK-NEXT: ldr q5, [x0, #32]
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: add x0, x0, #128
-; CHECK-NEXT: uzp1.4s v4, v5, v4
-; CHECK-NEXT: uzp1.4s v2, v3, v2
; CHECK-NEXT: uzp1.4s v0, v1, v0
-; CHECK-NEXT: uzp1.8h v1, v2, v4
+; CHECK-NEXT: uzp1.4s v1, v5, v4
+; CHECK-NEXT: uzp1.4s v2, v3, v2
; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: uzp1.16b v1, v1, v0
-; CHECK-NEXT: xtn.8b v0, v0
-; CHECK-NEXT: st1.h { v1 }[4], [x9]
-; CHECK-NEXT: add x9, x1, #10
-; CHECK-NEXT: st1.b { v0 }[2], [x9]
-; CHECK-NEXT: str d1, [x1], #16
+; CHECK-NEXT: uzp1.8h v1, v2, v1
+; CHECK-NEXT: uzp1.8b v2, v0, v0
+; CHECK-NEXT: uzp1.16b v0, v1, v0
+; CHECK-NEXT: st1.b { v2 }[2], [x9]
+; CHECK-NEXT: add x9, x1, #8
+; CHECK-NEXT: st1.h { v0 }[4], [x9]
+; CHECK-NEXT: str d0, [x1], #16
; CHECK-NEXT: b.eq LBB6_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -755,7 +755,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: xtn v0.4h, v0.4s
; CHECK-BE-NEXT: uzp1 v1.8h, v1.8h, v2.8h
; CHECK-BE-NEXT: uzp1 v1.16b, v1.16b, v0.16b
-; CHECK-BE-NEXT: xtn v0.8b, v0.8h
+; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-BE-NEXT: rev16 v2.16b, v1.16b
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
; CHECK-BE-NEXT: st1 { v0.b }[2], [x9]
@@ -790,7 +790,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-DISABLE-NEXT: xtn v0.4h, v0.4s
; CHECK-DISABLE-NEXT: uzp1 v1.8h, v1.8h, v2.8h
; CHECK-DISABLE-NEXT: uzp1 v1.16b, v1.16b, v0.16b
-; CHECK-DISABLE-NEXT: xtn v0.8b, v0.8h
+; CHECK-DISABLE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-DISABLE-NEXT: rev16 v2.16b, v1.16b
; CHECK-DISABLE-NEXT: rev64 v1.16b, v1.16b
; CHECK-DISABLE-NEXT: st1 { v0.b }[2], [x9]
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index e05c65daf50aae..f0bbed59405e3f 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -142,7 +142,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-NEXT: movi d0, #0xff00ff00ff00ff
; CHECK-NEXT: uaddl v1.8h, v1.8b, v2.8b
; CHECK-NEXT: umin v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 05f43e7d8427b7..82c0327219f504 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -143,7 +143,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
index 9c6ab8da0fa756..dd7a9c6d7768bc 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -210,7 +210,7 @@ define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) {
; CHECK-LABEL: no_combine_for_non_bool_truncate:
; CHECK: ; %bb.0:
; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: xtn.8b v0, v0
+; CHECK-NEXT: uzp1.8b v0, v0, v0
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 90328f73f86b51..71d55df6651768 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -410,7 +410,7 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: ldrh w8, [x0, #4]
; BE-NEXT: rev32 v0.4h, v0.4h
; BE-NEXT: mov v0.h[2], w8
-; BE-NEXT: xtn v0.8b, v0.8h
+; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; BE-NEXT: rev32 v0.16b, v0.16b
; BE-NEXT: str s0, [sp, #12]
; BE-NEXT: ldrh w9, [sp, #12]
@@ -456,7 +456,7 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: add x8, x8, :lo12:.LCPI11_0
; BE-NEXT: ld1 { v1.4h }, [x8]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -638,7 +638,7 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -672,7 +672,7 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -706,7 +706,7 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -741,7 +741,7 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -777,7 +777,7 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -801,7 +801,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: shrn.4h v0, v0, #16
-; CHECK-NEXT: xtn.8b v1, v0
+; CHECK-NEXT: uzp1.8b v1, v0, v0
; CHECK-NEXT: umov.h w8, v0[2]
; CHECK-NEXT: str s1, [sp, #12]
; CHECK-NEXT: ldrh w9, [sp, #12]
@@ -816,7 +816,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; BE-NEXT: .cfi_def_cfa_offset 16
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
@@ -868,7 +868,7 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: ld1 { v0.b }[4], [x9]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #8]
@@ -921,7 +921,7 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: ld1 { v0.b }[4], [x9]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
-; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: umov w8, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #8]
More information about the llvm-commits
mailing list