[llvm] e348b09 - [AArch64] Turn UZP1 with undef operand into truncate

Fri Mar 4 03:12:30 PST 2022

Author: David Green
Date: 2022-03-04T11:12:26Z
New Revision: e348b09bb5b1c21f6af17e3390807947a049f37e

URL: https://github.com/llvm/llvm-project/commit/e348b09bb5b1c21f6af17e3390807947a049f37e
DIFF: https://github.com/llvm/llvm-project/commit/e348b09bb5b1c21f6af17e3390807947a049f37e.diff

LOG: [AArch64] Turn UZP1 with undef operand into truncate

This turns uzp1(x, undef) into concat(truncate(x), undef), as the truncate
is simpler and can often be optimized away, and it helps some of the
insert-subvector tests optimize more cleanly.

Differential Revision: https://reviews.llvm.org/D120879

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/insert-subvector.ll
    llvm/test/CodeGen/AArch64/neon-perm.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 72756ea12ca83..0ef441b12eca2 100644

--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16179,6 +16179,33 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
   SDValue Op1 = N->getOperand(1);
   EVT ResVT = N->getValueType(0);
 
+  // uzp1(x, undef) -> concat(truncate(x), undef)
+  if (Op1.getOpcode() == ISD::UNDEF) {
+    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
+    switch (ResVT.getSimpleVT().SimpleTy) {
+    default:
+      break;
+    case MVT::v16i8:
+      BCVT = MVT::v8i16;
+      HalfVT = MVT::v8i8;
+      break;
+    case MVT::v8i16:
+      BCVT = MVT::v4i32;
+      HalfVT = MVT::v4i16;
+      break;
+    case MVT::v4i32:
+      BCVT = MVT::v2i64;
+      HalfVT = MVT::v2i32;
+      break;
+    }
+    if (BCVT != MVT::Other) {
+      SDValue BC = DAG.getBitcast(BCVT, Op0);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
+                         DAG.getUNDEF(HalfVT));
+    }
+  }
+
   // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
   if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
     if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {

diff  --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll
index f5ae348c161f5..e18999c892a9c 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll
@@ -260,11 +260,9 @@ define <4 x i32> @insert_v4i32_2_2(float %tmp, <4 x i32> %b, <4 x i32> %a) {
 define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
 ; CHECK-LABEL: load_v16i8_4_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[0], v2.s[0]
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-NEXT:    ret
   %l = load <4 x i8>, <4 x i8> *%a
   %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -275,11 +273,9 @@ define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
 define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
 ; CHECK-LABEL: load_v16i8_4_15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    adrp x8, .LCPI24_0
-; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q0_q1
-; CHECK-NEXT:    uzp1 v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI24_0]
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
@@ -292,11 +288,9 @@ define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
 define <16 x i8> @load_v16i8_4_2(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
 ; CHECK-LABEL: load_v16i8_4_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-NEXT:    ret
   %l = load <4 x i8>, <4 x i8> *%a
   %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -307,11 +301,9 @@ define <16 x i8> @load_v16i8_4_2(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
 define <16 x i8> @load_v16i8_4_3(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
 ; CHECK-LABEL: load_v16i8_4_3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    mov v0.s[2], v1.s[0]
 ; CHECK-NEXT:    ret
   %l = load <4 x i8>, <4 x i8> *%a
   %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -322,11 +314,9 @@ define <16 x i8> @load_v16i8_4_3(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
 define <16 x i8> @load_v16i8_4_4(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
 ; CHECK-LABEL: load_v16i8_4_4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[3], v2.s[0]
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    ret
   %l = load <4 x i8>, <4 x i8> *%a
   %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -399,11 +389,11 @@ define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w9, [x0]
 ; CHECK-NEXT:    add x8, x0, #2
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[0], v2.s[0]
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
+; CHECK-NEXT:    xtn v1.4h, v2.4s
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, <2 x i16> *%a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -420,9 +410,9 @@ define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
 ; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
 ; CHECK-NEXT:    adrp x8, .LCPI33_0
-; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI33_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI33_0]
+; CHECK-NEXT:    xtn v0.4h, v2.4s
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v3.16b
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, <2 x i16> *%a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -435,11 +425,11 @@ define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w9, [x0]
 ; CHECK-NEXT:    add x8, x0, #2
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
+; CHECK-NEXT:    xtn v1.4h, v2.4s
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, <2 x i16> *%a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -452,11 +442,11 @@ define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w9, [x0]
 ; CHECK-NEXT:    add x8, x0, #2
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
+; CHECK-NEXT:    xtn v1.4h, v2.4s
+; CHECK-NEXT:    mov v0.s[2], v1.s[0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, <2 x i16> *%a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -469,11 +459,11 @@ define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w9, [x0]
 ; CHECK-NEXT:    add x8, x0, #2
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[3], v2.s[0]
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
+; CHECK-NEXT:    xtn v1.4h, v2.4s
+; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, <2 x i16> *%a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

diff  --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll
index 3b2030ea65329..26ffa2727a1cd 100644
--- a/llvm/test/CodeGen/AArch64/neon-perm.ll
+++ b/llvm/test/CodeGen/AArch64/neon-perm.ll
@@ -2203,7 +2203,7 @@ entry:
 define <16 x i8> @test_undef_vuzp1q_s8(<16 x i8> %a) {
 ; CHECK-LABEL: test_undef_vuzp1q_s8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v0.16b
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -2223,7 +2223,7 @@ entry:
 define <8 x i16> @test_undef_vuzp1q_s16(<8 x i16> %a) {
 ; CHECK-LABEL: test_undef_vuzp1q_s16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -2233,7 +2233,7 @@ entry:
 define <4 x i32> @test_undef_vuzp1q_s32(<4 x i32> %a) {
 ; CHECK-LABEL: test_undef_vuzp1q_s32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -2253,7 +2253,7 @@ entry:
 define <16 x i8> @test_undef_vuzp1q_u8(<16 x i8> %a) {
 ; CHECK-LABEL: test_undef_vuzp1q_u8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v0.16b
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -2273,7 +2273,7 @@ entry:
 define <8 x i16> @test_undef_vuzp1q_u16(<8 x i16> %a) {
 ; CHECK-LABEL: test_undef_vuzp1q_u16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -2283,7 +2283,7 @@ entry:
 define <4 x i32> @test_undef_vuzp1q_u32(<4 x i32> %a) {
 ; CHECK-LABEL: test_undef_vuzp1q_u32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -2313,7 +2313,7 @@ entry:
 define <16 x i8> @test_undef_vuzp1q_p8(<16 x i8> %a) {
 ; CHECK-LABEL: test_undef_vuzp1q_p8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v0.16b
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -2333,7 +2333,7 @@ entry:
 define <8 x i16> @test_undef_vuzp1q_p16(<8 x i16> %a) {
 ; CHECK-LABEL: test_undef_vuzp1q_p16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>