[llvm] [AArch64] Improve lowering of truncating uzp1 (PR #82457)

Usman Nadeem via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 21 10:01:53 PST 2024


https://github.com/UsmanNadeem updated https://github.com/llvm/llvm-project/pull/82457

>From 93aa87491ebc5402278699379dede34916ee9f37 Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Tue, 20 Feb 2024 19:30:17 -0800
Subject: [PATCH 1/2] [AArch64] Improve lowering of truncating uzp1

This is related to/is a subset of PR 81960.

Instead of creating new build_vector nodes that remove the assert
sext/zext nodes as in PR 81960, I am handling it by adding to an
existing uzp1 combine to look through the assert ext nodes. This
should help preserve the extension information until very late in
the process.

Change-Id: Idb9f90cd9c4d8a9537509f2be619e17be564eef3
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  54 +++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   9 +
 .../CodeGen/AArch64/arm64-convert-v4f64.ll    |  21 +-
 .../CodeGen/AArch64/fp-conversion-to-tbl.ll   |   5 +-
 llvm/test/CodeGen/AArch64/fptoi.ll            | 256 ++++++------------
 llvm/test/CodeGen/AArch64/vcvt-oversize.ll    |   5 +-
 6 files changed, 145 insertions(+), 205 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8c5a4cdae11634..eb078ae499872e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21034,12 +21034,8 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
-  // Only implemented on little-endian subtargets.
-  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
-
-  // This optimization only works on little endian.
-  if (!IsLittleEndian)
+  // These optimizations only works on little endian.
+  if (!DAG.getDataLayout().isLittleEndian())
     return SDValue();
 
   // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
@@ -21059,20 +21055,50 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   auto getSourceOp = [](SDValue Operand) -> SDValue {
-    const unsigned Opcode = Operand.getOpcode();
-    if (Opcode == ISD::TRUNCATE)
-      return Operand->getOperand(0);
-    if (Opcode == ISD::BITCAST &&
-        Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
-      return Operand->getOperand(0)->getOperand(0);
-    return SDValue();
+    if (Operand.getOpcode() == ISD::BITCAST)
+      Operand = Operand->getOperand(0);
+    if (Operand.getOpcode() == ISD::AssertSext ||
+        Operand.getOpcode() == ISD::AssertZext)
+      Operand = Operand->getOperand(0);
+    return Operand;
   };
 
   SDValue SourceOp0 = getSourceOp(Op0);
   SDValue SourceOp1 = getSourceOp(Op1);
 
-  if (!SourceOp0 || !SourceOp1)
+  auto IsTruncatingUZP1Concat = [](SDNode *N, LLVMContext &Ctx) -> bool {
+    if (N->getOpcode() != AArch64ISD::UZP1)
+      return false;
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    if (Op0.getOpcode() != ISD::BITCAST || Op1.getOpcode() != ISD::BITCAST)
+      return false;
+
+    EVT ResVT = N->getValueType(0);
+    return ResVT.widenIntegerVectorElementType(Ctx).getHalfNumVectorElementsVT(
+               Ctx) == Op0.getOperand(0).getValueType();
+  };
+
+  // truncating uzp1(x=uzp1, y=uzp1) -> trunc(concat (x, y))
+  // This is similar to the transform below, except that it looks for truncation
+  // done using the uzp1 node.
+  LLVMContext &Ctx = *DAG.getContext();
+  if (IsTruncatingUZP1Concat(N, Ctx) &&
+      IsTruncatingUZP1Concat(SourceOp0.getNode(), Ctx) &&
+      IsTruncatingUZP1Concat(SourceOp1.getNode(), Ctx)) {
+    SDValue UZP1 =
+        DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                    SourceOp0.getValueType().getDoubleNumVectorElementsVT(Ctx),
+                    SourceOp0, SourceOp1);
+    return DAG.getNode(ISD::TRUNCATE, DL, ResVT, UZP1);
+  }
+
+  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
+  if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
+      SourceOp1.getOpcode() != ISD::TRUNCATE)
     return SDValue();
+  SourceOp0 = SourceOp0.getOperand(0);
+  SourceOp1 = SourceOp1.getOperand(0);
 
   if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
       !SourceOp0.getValueType().isSimple())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 8c2a852850320f..270b7ba9ccbead 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6125,6 +6125,15 @@ def : Pat<(v8i16 (concat_vectors (v4i16 (assertzext (trunc (v4i32 V128:$Vn)))),
 def : Pat<(v4i32 (concat_vectors (v2i32 (assertzext (trunc (v2i64 V128:$Vn)))),
                                  (v2i32 (assertzext (trunc (v2i64 V128:$Vm)))))),
           (UZP1v4i32 V128:$Vn, V128:$Vm)>;
+def : Pat<(v16i8 (concat_vectors (v8i8 (assertsext (trunc (v8i16 V128:$Vn)))),
+                                 (v8i8 (assertsext (trunc (v8i16 V128:$Vm)))))),
+          (UZP1v16i8 V128:$Vn, V128:$Vm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 (assertsext (trunc (v4i32 V128:$Vn)))),
+                                 (v4i16 (assertsext (trunc (v4i32 V128:$Vm)))))),
+          (UZP1v8i16 V128:$Vn, V128:$Vm)>;
+def : Pat<(v4i32 (concat_vectors (v2i32 (assertsext (trunc (v2i64 V128:$Vn)))),
+                                 (v2i32 (assertsext (trunc (v2i64 V128:$Vm)))))),
+          (UZP1v4i32 V128:$Vn, V128:$Vm)>;
 
 def : Pat<(v16i8 (concat_vectors
                  (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 9bf638f57a5120..193e3b0cfbc7bc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -8,9 +8,8 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr
   %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
@@ -26,13 +25,10 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
 ; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEXT:    uzp1 v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    uzp1 v1.4h, v2.4h, v3.4h
-; CHECK-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x double>, ptr %ptr
   %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
@@ -72,9 +68,8 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr
   %tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 1ea87bb6b04b51..0a3b9a070c2b32 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -73,9 +73,8 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs.4s v1, v1
 ; CHECK-NEXT:    fcvtzs.4s v0, v0
-; CHECK-NEXT:    xtn.4h v1, v1
-; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    uzp1.8b v0, v0, v1
+; CHECK-NEXT:    uzp1.8h v0, v0, v1
+; CHECK-NEXT:    xtn.8b v0, v0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 251719c1e3b430..3feb65e4eee66d 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1096,30 +1096,17 @@ entry:
 }
 
 define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) {
-; CHECK-SD-LABEL: fptos_v3f64_v3i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
-; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fptos_v3f64_v3i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
-; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fptos_v3f64_v3i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    fcvtzs v1.2d, v2.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
 entry:
   %c = fptosi <3 x double> %a to <3 x i16>
   ret <3 x i16> %c
@@ -1134,9 +1121,8 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) {
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v3f64_v3i16:
@@ -1160,9 +1146,8 @@ define <4 x i16> @fptos_v4f64_v4i16(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v4f64_v4i16:
@@ -1182,9 +1167,8 @@ define <4 x i16> @fptou_v4f64_v4i16(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v4f64_v4i16:
@@ -1600,9 +1584,8 @@ define <3 x i8> @fptos_v3f64_v3i8(<3 x double> %a) {
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    umov w0, v0.h[0]
 ; CHECK-SD-NEXT:    umov w1, v0.h[1]
 ; CHECK-SD-NEXT:    umov w2, v0.h[2]
@@ -1638,9 +1621,8 @@ define <3 x i8> @fptou_v3f64_v3i8(<3 x double> %a) {
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    umov w0, v0.h[0]
 ; CHECK-SD-NEXT:    umov w1, v0.h[1]
 ; CHECK-SD-NEXT:    umov w2, v0.h[2]
@@ -1672,9 +1654,8 @@ define <4 x i8> @fptos_v4f64_v4i8(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v4f64_v4i8:
@@ -1694,9 +1675,8 @@ define <4 x i8> @fptou_v4f64_v4i8(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v4f64_v4i8:
@@ -1718,13 +1698,10 @@ define <8 x i8> @fptos_v8f64_v8i8(<8 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v8f64_v8i8:
@@ -1750,13 +1727,10 @@ define <8 x i8> @fptou_v8f64_v8i8(<8 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v8f64_v8i8:
@@ -1786,21 +1760,13 @@ define <16 x i8> @fptos_v16f64_v16i8(<16 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v16f64_v16i8:
@@ -1837,21 +1803,13 @@ define <16 x i8> @fptou_v16f64_v16i8(<16 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v16f64_v16i8:
@@ -1900,36 +1858,20 @@ define <32 x i8> @fptos_v32f64_v32i8(<32 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v18.2d, v18.2d
 ; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
 ; CHECK-SD-NEXT:    fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    xtn v23.2s, v23.2d
-; CHECK-SD-NEXT:    xtn v22.2s, v22.2d
-; CHECK-SD-NEXT:    xtn v21.2s, v21.2d
-; CHECK-SD-NEXT:    xtn v20.2s, v20.2d
-; CHECK-SD-NEXT:    xtn v19.2s, v19.2d
-; CHECK-SD-NEXT:    xtn v18.2s, v18.2d
-; CHECK-SD-NEXT:    xtn v17.2s, v17.2d
-; CHECK-SD-NEXT:    xtn v16.2s, v16.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT:    uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT:    uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT:    uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT:    mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT:    uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT:    uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT:    uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v2.8h, v7.8h, v5.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT:    uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v32f64_v32i8:
@@ -1997,36 +1939,20 @@ define <32 x i8> @fptou_v32f64_v32i8(<32 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v18.2d, v18.2d
 ; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
 ; CHECK-SD-NEXT:    fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    xtn v23.2s, v23.2d
-; CHECK-SD-NEXT:    xtn v22.2s, v22.2d
-; CHECK-SD-NEXT:    xtn v21.2s, v21.2d
-; CHECK-SD-NEXT:    xtn v20.2s, v20.2d
-; CHECK-SD-NEXT:    xtn v19.2s, v19.2d
-; CHECK-SD-NEXT:    xtn v18.2s, v18.2d
-; CHECK-SD-NEXT:    xtn v17.2s, v17.2d
-; CHECK-SD-NEXT:    xtn v16.2s, v16.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT:    uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT:    uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT:    uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT:    mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT:    uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT:    uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT:    uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v2.8h, v7.8h, v5.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT:    uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v32f64_v32i8:
@@ -3028,9 +2954,8 @@ define <8 x i8> @fptos_v8f32_v8i8(<8 x float> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v8f32_v8i8:
@@ -3050,9 +2975,8 @@ define <8 x i8> @fptou_v8f32_v8i8(<8 x float> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v8f32_v8i8:
@@ -3074,12 +2998,8 @@ define <16 x i8> @fptos_v16f32_v16i8(<16 x float> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.4s, v2.4s
 ; CHECK-SD-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT:    xtn v3.4h, v3.4s
-; CHECK-SD-NEXT:    xtn v2.4h, v2.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
@@ -3136,20 +3056,12 @@ define <32 x i8> @fptos_v32f32_v32i8(<32 x float> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v6.4s, v6.4s
 ; CHECK-SD-NEXT:    fcvtzs v5.4s, v5.4s
 ; CHECK-SD-NEXT:    fcvtzs v4.4s, v4.4s
-; CHECK-SD-NEXT:    xtn v3.4h, v3.4s
-; CHECK-SD-NEXT:    xtn v2.4h, v2.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    xtn v7.4h, v7.4s
-; CHECK-SD-NEXT:    xtn v6.4h, v6.4s
-; CHECK-SD-NEXT:    xtn v5.4h, v5.4s
-; CHECK-SD-NEXT:    xtn v4.4h, v4.4s
-; CHECK-SD-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    mov v4.d[1], v5.d[0]
+; CHECK-SD-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v6.8h, v7.8h
+; CHECK-SD-NEXT:    uzp1 v3.8h, v4.8h, v5.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT:    uzp1 v1.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v32f32_v32i8:
diff --git a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
index 380bdbcc7f7408..611940546bc1a2 100644
--- a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
+++ b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -9,9 +9,8 @@ define <8 x i8> @float_to_i8(ptr %in) {
 ; CHECK-NEXT:    fadd v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %l = load <8 x float>, ptr %in
   %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>

>From 0a08e0b10a01f69906669991eadf6d5f6dd26fab Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Wed, 21 Feb 2024 10:01:27 -0800
Subject: [PATCH 2/2] Check bitcast source type

Change-Id: Ib1018cdc616550d41a6cdb9bdde07654f0927f33
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eb078ae499872e..ba56b0b308e198 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21073,10 +21073,13 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
     SDValue Op1 = N->getOperand(1);
     if (Op0.getOpcode() != ISD::BITCAST || Op1.getOpcode() != ISD::BITCAST)
       return false;
+    EVT Op0Ty = Op0.getOperand(0).getValueType();
+    if (Op0Ty != Op1.getOperand(0).getValueType())
+      return false;
 
     EVT ResVT = N->getValueType(0);
     return ResVT.widenIntegerVectorElementType(Ctx).getHalfNumVectorElementsVT(
-               Ctx) == Op0.getOperand(0).getValueType();
+               Ctx) == Op0Ty;
   };
 
   // truncating uzp1(x=uzp1, y=uzp1) -> trunc(concat (x, y))



More information about the llvm-commits mailing list