[llvm] [AArch64] Remove copy instruction between uaddlv with v4i16/v8i16 and dup (PR #66508)

Fri Sep 15 06:16:25 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64
            
<details>
<summary>Changes</summary>
If there are copy instructions between a uaddlv with a v4i16/v8i16 operand and a dup, inserted to transfer the value from GPR to FPR, try to remove them by using duplane instead. This is a follow-up patch to https://reviews.llvm.org/D159267
--
Full diff: https://github.com/llvm/llvm-project/pull/66508.diff

5 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+23-3) 
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+9) 
- (modified) llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll (+9-9) 
- (modified) llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll (+7-8) 
- (modified) llvm/test/CodeGen/AArch64/neon-addlv.ll (+33-1) 


<pre>

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 30a66c7ae4be845..72f9785a2aee655 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5330,7 +5330,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::aarch64_neon_uaddlv: {
     EVT OpVT = Op.getOperand(1).getValueType();
     EVT ResVT = Op.getValueType();
-    if (ResVT == MVT::i32 &amp;&amp; (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+    if (ResVT == MVT::i32 &amp;&amp; (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
+                              OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
       // In order to avoid insert_subvector, used v4i32 than v2i32.
       SDValue UADDLV =
           DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
@@ -22286,6 +22287,7 @@ static SDValue performSelectCombine(SDNode *N,
 static SDValue performDUPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &amp;DCI) {
   EVT VT = N-&gt;getValueType(0);
+  SDLoc DL(N);
   // If &quot;v2i32 DUP(x)&quot; and &quot;v4i32 DUP(x)&quot; both exist, use an extract from the
   // 128bit vector version.
   if (VT.is64BitVector() &amp;&amp; DCI.isAfterLegalizeDAG()) {
@@ -22293,14 +22295,32 @@ static SDValue performDUPCombine(SDNode *N,
     SmallVector&lt;SDValue&gt; Ops(N-&gt;ops());
     if (SDNode *LN = DCI.DAG.getNodeIfExists(N-&gt;getOpcode(),
                                              DCI.DAG.getVTList(LVT), Ops)) {
-      SDLoc DL(N);
       return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
                              DCI.DAG.getConstant(0, DL, MVT::i64));
     }
   }
 
-  if (N-&gt;getOpcode() == AArch64ISD::DUP)
+  if (N-&gt;getOpcode() == AArch64ISD::DUP) {
+    if (DCI.isAfterLegalizeDAG()) {
+      // If scalar dup&#x27;s operand is extract_vector_elt, try to combine them into
+      // duplane. For example,
+      //
+      //    t21: i32 = extract_vector_elt t19, Constant:i64&lt;0&gt;
+      //  t18: v4i32 = AArch64ISD::DUP t21
+      //  ==&gt;
+      //  t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64&lt;0&gt;
+      SDValue EXTRACT_VEC_ELT = N-&gt;getOperand(0);
+      if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
+          unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+          return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
+                                 EXTRACT_VEC_ELT.getOperand(1));
+        }
+      }
+    }
+
     return performPostLD1Combine(N, DCI, false);
+  }
 
   return SDValue();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 82b79cd7232cc90..8a7b18ed46ca32f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6472,12 +6472,21 @@ def : Pat&lt;(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
             (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
             ssub))&gt;;
 
+def : Pat&lt;(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))&gt;;
+
 def : Pat&lt;(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))&gt;;
 
+def : Pat&lt;(v4i32 (AArch64uaddlv (v4i16 V64:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv4i16v V64:$Rn), ssub))&gt;;
+
 def : Pat&lt;(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))&gt;;
 
+def : Pat&lt;(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))&gt;;
+
 // Patterns for across-vector intrinsics, that have a node equivalent, that
 // returns a vector (with only the low lane defined) instead of a scalar.
 // In effect, opNode is the same as (scalar_to_vector (IntNode)).
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index bf420700eb575fb..55750ab34e17a03 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -14,8 +14,8 @@ define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-NEXT:    uaddlv.8h s0, v0
 ; CHECK-NEXT:    mov.s v1[0], v0[0]
-; CHECK-NEXT:    ucvtf.2s v1, v1
-; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ucvtf.2s v0, v1
+; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -52,8 +52,8 @@ define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    uaddlv.8h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -76,8 +76,8 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    st1.s { v0 }[2], [x8]
 ; CHECK-NEXT:    str d0, [x0, #80]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    str q2, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -256,9 +256,9 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
 ; CHECK-NEXT:    uaddlv.4h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v2, v2
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ucvtf.2d v1, v2
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
diff --git a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
index 3b064b718cd679d..20adcdf2956d691 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
@@ -9,16 +9,15 @@ define i32 @widget(i64 %arg, &lt;8 x i16&gt; %arg1) {
 ; CHECK:       // %bb.0: // %bb
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    bfi x10, x0, #1, #3
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    bfi x9, x0, #1, #3
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    dup v1.8h, w9
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    ld1 { v1.h }[1], [x10]
-; CHECK-NEXT:    str q1, [x8]
+; CHECK-NEXT:    str q1, [sp]
+; CHECK-NEXT:    ld1 { v0.h }[1], [x9]
+; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 bb:
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
index 1b037c13aa4b546..0241091fae02542 100644
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -195,7 +195,6 @@ entry:
 }
 
 declare &lt;8 x i8&gt; @llvm.aarch64.neon.rshrn.v8i8(&lt;8 x i16&gt;, i32)
-
 declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)
 
 define &lt;8 x i8&gt; @uaddlv_v8i8_urshr(&lt;8 x i8&gt; %a) {
@@ -215,3 +214,36 @@ entry:
   %vecinit7.i = shufflevector &lt;8 x i8&gt; %vecinit.i, &lt;8 x i8&gt; poison, &lt;8 x i32&gt; zeroinitializer
   ret &lt;8 x i8&gt; %vecinit7.i
 }
+
+define &lt;4 x i32&gt; @uaddlv_dup_v4i16(&lt;4 x i16&gt; %a) {
+; CHECK-LABEL: uaddlv_dup_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.4h
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(&lt;4 x i16&gt; %a)
+  %vecinit.i = insertelement &lt;4 x i32&gt; undef, i32 %vaddlv.i, i64 0
+  %vecinit7.i = shufflevector &lt;4 x i32&gt; %vecinit.i, &lt;4 x i32&gt; poison, &lt;4 x i32&gt; zeroinitializer
+  %vshr_n = lshr &lt;4 x i32&gt; %vecinit7.i, &lt;i32 3, i32 3, i32 3, i32 3&gt;
+  ret &lt;4 x i32&gt; %vshr_n
+}
+
+define &lt;4 x i32&gt; @uaddlv_dup_v8i16(&lt;8 x i16&gt; %a) {
+; CHECK-LABEL: uaddlv_dup_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.8h
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(&lt;8 x i16&gt; %a)
+  %vecinit.i = insertelement &lt;4 x i32&gt; undef, i32 %vaddlv.i, i64 0
+  %vecinit7.i = shufflevector &lt;4 x i32&gt; %vecinit.i, &lt;4 x i32&gt; poison, &lt;4 x i32&gt; zeroinitializer
+  %vshr_n = lshr &lt;4 x i32&gt; %vecinit7.i, &lt;i32 3, i32 3, i32 3, i32 3&gt;
+  ret &lt;4 x i32&gt; %vshr_n
+}
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(&lt;8 x i16&gt;)
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(&lt;4 x i16&gt;)
</pre>
</details>


https://github.com/llvm/llvm-project/pull/66508