[llvm] [AArch64] Remove copy instruction between uaddlv with v8i16 and dup (PR #66068)

Tue Sep 12 04:11:24 PDT 2023

https://github.com/jaykang10 created https://github.com/llvm/llvm-project/pull/66068:

If there are copy instructions between uaddlv with v8i16 and dup for transfer from gpr to fpr, try to remove them with duplane.
It is a follow-up patch of https://reviews.llvm.org/D159267

>From 62253e16f8351bc2974e9d0a5f1b462432b8c9c8 Mon Sep 17 00:00:00 2001
From: Jingu Kang <jingu.kang at arm.com>
Date: Tue, 5 Sep 2023 16:56:00 +0100
Subject: [PATCH] [AArch64] Remove copy instruction between uaddlv with v8i16
 and dup

If there are copy instructions between uaddlv with v8i16 and dup for transfer
from gpr to fpr, try to remove them with duplane.
It is a follow-up patch of https://reviews.llvm.org/D159267
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  3 ++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  8 ++++++++
 .../aarch64-neon-vector-insert-uaddlv.ll      | 12 ++++++------
 llvm/test/CodeGen/AArch64/neon-addlv.ll       | 19 ++++++++++++++++++-
 4 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fb25dd1d77553fb..510cf33e7007029 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5329,7 +5329,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::aarch64_neon_uaddlv: {
     EVT OpVT = Op.getOperand(1).getValueType();
     EVT ResVT = Op.getValueType();
-    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+    if (ResVT == MVT::i32 &&
+        (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || OpVT == MVT::v8i16)) {
       // In order to avoid insert_subvector, used v4i32 than v2i32.
       SDValue UADDLV =
           DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 82b79cd7232cc90..173b6dd67d20c88 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6077,6 +6077,8 @@ defm : DUPWithTruncPats<v16i8,  v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
 defm : DUPWithTruncPats<v16i8,  v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
 defm : DUPWithTruncPats<v8i16,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
 
+defm : DUPWithTruncPats<v4i32,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
 multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
                                SDNodeXForm IdxXFORM> {
   def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
@@ -6472,12 +6474,18 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
             (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
             ssub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))>;
+
 def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
 
 def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>;
+
 // Patterns for across-vector intrinsics, that have a node equivalent, that
 // returns a vector (with only the low lane defined) instead of a scalar.
 // In effect, opNode is the same as (scalar_to_vector (IntNode)).
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index bf420700eb575fb..ccc7a2d9ebac4c2 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -14,8 +14,8 @@ define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-NEXT:    uaddlv.8h s0, v0
 ; CHECK-NEXT:    mov.s v1[0], v0[0]
-; CHECK-NEXT:    ucvtf.2s v1, v1
-; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ucvtf.2s v0, v1
+; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -52,8 +52,8 @@ define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    uaddlv.8h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -76,8 +76,8 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    st1.s { v0 }[2], [x8]
 ; CHECK-NEXT:    str d0, [x0, #80]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    str q2, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
index 1b037c13aa4b546..7cc1d2e6647ab96 100644
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -194,8 +194,23 @@ entry:
   ret <8 x i8> %vrshrn_n2
 }
 
-declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+define dso_local <8 x i16> @uaddlv_dup_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: uaddlv_dup_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.8h
+; CHECK-NEXT:    dup v1.8h, v0.h[0]
+; CHECK-NEXT:    rshrn v0.4h, v1.4s, #3
+; CHECK-NEXT:    rshrn2 v0.8h, v1.4s, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
+  %vecinit.i = insertelement <8 x i32> undef, i32 %vaddlv.i, i64 0
+  %vecinit7.i = shufflevector <8 x i32> %vecinit.i, <8 x i32> poison, <8 x i32> zeroinitializer
+  %vrshrn_n2 = tail call <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32> %vecinit7.i, i32 3)
+  ret <8 x i16> %vrshrn_n2
+}
 
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
 declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)
 
 define <8 x i8> @uaddlv_v8i8_urshr(<8 x i8> %a) {
@@ -215,3 +230,5 @@ entry:
   %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> poison, <8 x i32> zeroinitializer
   ret <8 x i8> %vecinit7.i
 }
+declare <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32>, i32)
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)