[llvm] [AArch64] Optimize two large shifts and a combine into a single combine and shift (PR #99480)

Wed Jul 24 09:54:09 PDT 2024

https://github.com/sopyb updated https://github.com/llvm/llvm-project/pull/99480

From e98b4d5b7c8f6d518b28477b49a442e7446c9caa Mon Sep 17 00:00:00 2001
From: sopy <contact at sopy.one>
Date: Fri, 12 Jul 2024 18:30:13 +0300
Subject: [PATCH 01/11] Add test

---
 .../AArch64/optimise_combine_large_shifts.ll  | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll

diff --git a/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll b/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
new file mode 100644
index 0000000000000..8ca7f78519aa4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+
+define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef %1) local_unnamed_addr #0 {
+; CHECK-LABEL: combine16ix8
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #10
+  %3 = lshr <8 x i16> %0, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
+  %4 = lshr <8 x i16> %1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
+  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = trunc nuw nsw <16 x i16> %5 to <16 x i8>
+  ret <16 x i8> %6
+}
+
+define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef %1) local_unnamed_addr #0 {
+; CHECK-LABEL: combine32ix4
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #20
+  %3 = lshr <4 x i32> %0, <i32 20, i32 20, i32 20, i32 20>
+  %4 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
+  %5 = shufflevector <4 x i32> %3, <4 x i32> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = trunc nuw nsw <8 x i32> %5 to <8 x i16>
+  ret <8 x i16> %6
+}
+
+define dso_local <4 x i32> @combine64ix2(<2 x i64> noundef %0, <2 x i64> noundef %1) local_unnamed_addr #0 {
+; CHECK-LABEL: combine64ix2
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #40
+  %3 = lshr <2 x i64> %0, <i64 40, i64 40>
+  %4 = lshr <2 x i64> %1, <i64 40, i64 40>
+  %5 = shufflevector <2 x i64> %3, <2 x i64> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = trunc nuw nsw <4 x i64> %5 to <4 x i32>
+  ret <4 x i32> %6
+}
\ No newline at end of file

From 49d67b62995d17e53cd1eeaad10513545e1b459e Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Tue, 16 Jul 2024 22:27:56 +0000
Subject: [PATCH 02/11] Fix test

---
 .../CodeGen/AArch64/optimise_combine_large_shifts.ll     | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll b/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
index 8ca7f78519aa4..d513e7908e703 100644
--- a/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
+++ b/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
@@ -3,7 +3,8 @@
 define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: combine16ix8
 ; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #10
+; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #2
+; CHECK-NEXT: ret
   %3 = lshr <8 x i16> %0, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %4 = lshr <8 x i16> %1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -14,7 +15,8 @@ define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef
 define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: combine32ix4
 ; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #20
+; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #4
+; CHECK-NEXT: ret
   %3 = lshr <4 x i32> %0, <i32 20, i32 20, i32 20, i32 20>
   %4 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
   %5 = shufflevector <4 x i32> %3, <4 x i32> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -25,7 +27,8 @@ define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef
 define dso_local <4 x i32> @combine64ix2(<2 x i64> noundef %0, <2 x i64> noundef %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: combine64ix2
 ; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #40
+; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #8
+; CHECK-NEXT: ret
   %3 = lshr <2 x i64> %0, <i64 40, i64 40>
   %4 = lshr <2 x i64> %1, <i64 40, i64 40>
   %5 = shufflevector <2 x i64> %3, <2 x i64> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

From 534b12c54a03d078abc891d367d5d0db532d31fb Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Thu, 18 Jul 2024 10:22:14 +0000
Subject: [PATCH 03/11] Update neon-rshrn.ll to align with optimizations

---
 llvm/test/CodeGen/AArch64/neon-rshrn.ll | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/neon-rshrn.ll b/llvm/test/CodeGen/AArch64/neon-rshrn.ll
index 8d47f4afb355f..e648b10ea357b 100644
--- a/llvm/test/CodeGen/AArch64/neon-rshrn.ll
+++ b/llvm/test/CodeGen/AArch64/neon-rshrn.ll
@@ -110,11 +110,10 @@ define <16 x i8> @rshrn_v16i16_9(<16 x i16> %a) {
 ; CHECK-LABEL: rshrn_v16i16_9:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.8h, #1, lsl #8
-; CHECK-NEXT:    add v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    add v1.8h, v1.8h, v2.8h
-; CHECK-NEXT:    ushr v1.8h, v1.8h, #9
-; CHECK-NEXT:    ushr v0.8h, v0.8h, #9
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    add v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    uzp2 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #1
 ; CHECK-NEXT:    ret
 entry:
   %b = add <16 x i16> %a, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
@@ -336,11 +335,10 @@ define <8 x i16> @rshrn_v8i32_17(<8 x i32> %a) {
 ; CHECK-LABEL: rshrn_v8i32_17:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.4s, #1, lsl #16
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #17
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #17
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
 entry:
   %b = add <8 x i32> %a, <i32 65536, i32 65536, i32 65536, i32 65536, i32 65536, i32 65536, i32 65536, i32 65536>
@@ -771,11 +769,10 @@ define <4 x i32> @rshrn_v4i64_33(<4 x i64> %a) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov x8, #4294967296 // =0x100000000
 ; CHECK-NEXT:    dup v2.2d, x8
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    ushr v1.2d, v1.2d, #33
-; CHECK-NEXT:    ushr v0.2d, v0.2d, #33
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
 ; CHECK-NEXT:    ret
 entry:
   %b = add <4 x i64> %a, <i64 4294967296, i64 4294967296, i64 4294967296, i64 4294967296>

From c996bc98a73c073cd6ac1c8df4a1d3c348816b2e Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Thu, 18 Jul 2024 10:41:59 +0000
Subject: [PATCH 04/11] Lowering

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 42 +++++++++++++++++--
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eef83a845e2c3..c0cbf95f7198b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1849,7 +1849,7 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
   setOperationAction(ISD::SHL, VT, Custom);
   setOperationAction(ISD::OR, VT, Custom);
   setOperationAction(ISD::SETCC, VT, Custom);
-  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
 
   setOperationAction(ISD::SELECT, VT, Expand);
   setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -14198,9 +14198,43 @@ SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                    !Subtarget->isNeonAvailable()))
     return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
 
-  assert(Op.getValueType().isScalableVector() &&
-         isTypeLegal(Op.getValueType()) &&
-         "Expected legal scalable vector type!");
+  if (!Op.getValueType().isScalableVector()) {
+    const SDValue Trunc1 = Op.getOperand(0);
+    const SDValue Trunc2 = Op.getOperand(1);
+
+    if (Trunc1->getOpcode() == ISD::TRUNCATE &&
+        Trunc2->getOpcode() == ISD::TRUNCATE) {
+      const SDValue Shift1 = Trunc1->getOperand(0);
+      const SDValue Shift2 = Trunc2->getOperand(0);
+
+      // check for VLSHR with the same shift amount
+      if (Shift1->getOpcode() == AArch64ISD::VLSHR &&
+          Shift2->getOpcode() == AArch64ISD::VLSHR &&
+          Shift1->getOperand(1) == Shift2->getOperand(1)) {
+
+        const SDValue Vector1 = Shift1->getOperand(0);
+        const SDValue Vector2 = Shift2->getOperand(0);
+
+        const uint64_t ShiftConstant = Shift1->getConstantOperandVal(1);
+        const uint64_t PostUzpsize = Op.getScalarValueSizeInBits();
+
+        // Check if the shift constant is greater than the scalar value size
+        if (ShiftConstant > PostUzpsize) {
+          const EVT VT = Op.getValueType();
+          const SDLoc DL(Op);
+
+          const SDValue Uzp =
+              DAG.getNode(AArch64ISD::UZP2, DL, VT, Vector1, Vector2);
+          const SDValue NewShiftConstant =
+              DAG.getConstant(ShiftConstant - PostUzpsize, DL, MVT::i32);
+
+          return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
+        }
+      }
+    }
+
+    return Op;
+  }
 
   if (isTypeLegal(Op.getOperand(0).getValueType())) {
     unsigned NumOperands = Op->getNumOperands();

From 2c2d06ad42c4514c713d5c7d2ed1210290118f3b Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Mon, 22 Jul 2024 19:43:09 +0000
Subject: [PATCH 05/11] Moving logic to performConcatVectorsCombine

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 99 ++++++++++---------
 1 file changed, 50 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c0cbf95f7198b..d2db216984fa7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1849,7 +1849,7 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
   setOperationAction(ISD::SHL, VT, Custom);
   setOperationAction(ISD::OR, VT, Custom);
   setOperationAction(ISD::SETCC, VT, Custom);
-  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
 
   setOperationAction(ISD::SELECT, VT, Expand);
   setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -14198,43 +14198,9 @@ SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                    !Subtarget->isNeonAvailable()))
     return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
 
-  if (!Op.getValueType().isScalableVector()) {
-    const SDValue Trunc1 = Op.getOperand(0);
-    const SDValue Trunc2 = Op.getOperand(1);
-
-    if (Trunc1->getOpcode() == ISD::TRUNCATE &&
-        Trunc2->getOpcode() == ISD::TRUNCATE) {
-      const SDValue Shift1 = Trunc1->getOperand(0);
-      const SDValue Shift2 = Trunc2->getOperand(0);
-
-      // check for VLSHR with the same shift amount
-      if (Shift1->getOpcode() == AArch64ISD::VLSHR &&
-          Shift2->getOpcode() == AArch64ISD::VLSHR &&
-          Shift1->getOperand(1) == Shift2->getOperand(1)) {
-
-        const SDValue Vector1 = Shift1->getOperand(0);
-        const SDValue Vector2 = Shift2->getOperand(0);
-
-        const uint64_t ShiftConstant = Shift1->getConstantOperandVal(1);
-        const uint64_t PostUzpsize = Op.getScalarValueSizeInBits();
-
-        // Check if the shift constant is greater than the scalar value size
-        if (ShiftConstant > PostUzpsize) {
-          const EVT VT = Op.getValueType();
-          const SDLoc DL(Op);
-
-          const SDValue Uzp =
-              DAG.getNode(AArch64ISD::UZP2, DL, VT, Vector1, Vector2);
-          const SDValue NewShiftConstant =
-              DAG.getConstant(ShiftConstant - PostUzpsize, DL, MVT::i32);
-
-          return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
-        }
-      }
-    }
-
-    return Op;
-  }
+  assert(Op.getValueType().isScalableVector() &&
+         isTypeLegal(Op.getValueType()) &&
+         "Expected legal scalable vector type!");
 
   if (isTypeLegal(Op.getOperand(0).getValueType())) {
     unsigned NumOperands = Op->getNumOperands();
@@ -19152,23 +19118,24 @@ static SDValue performConcatVectorsCombine(SDNode *N,
   if (VT.isScalableVector())
     return SDValue();
 
-  // Optimize concat_vectors of truncated vectors, where the intermediate
-  // type is illegal, to avoid said illegality,  e.g.,
-  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
-  //                          (v2i16 (truncate (v2i64)))))
-  // ->
-  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
-  //                                    (v4i32 (bitcast (v2i64))),
-  //                                    <0, 2, 4, 6>)))
-  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
-  // on both input and result type, so we might generate worse code.
-  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
   if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
       N1Opc == ISD::TRUNCATE) {
     SDValue N00 = N0->getOperand(0);
     SDValue N10 = N1->getOperand(0);
     EVT N00VT = N00.getValueType();
+    unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
 
+    // Optimize concat_vectors of truncated vectors, where the intermediate
+    // type is illegal, to avoid said illegality,  e.g.,
+    //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
+    //                          (v2i16 (truncate (v2i64)))))
+    // ->
+    //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
+    //                                    (v4i32 (bitcast (v2i64))),
+    //                                    <0, 2, 4, 6>)))
+    // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
+    // on both input and result type, so we might generate worse code.
+    // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
     if (N00VT == N10.getValueType() &&
         (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
         N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
@@ -19182,6 +19149,40 @@ static SDValue performConcatVectorsCombine(SDNode *N,
                              DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
                              DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
     }
+
+    // Optimize two large shifts and a combine into a single combine and shift
+    // For AArch64 architectures, sequences like the following:
+    //
+    //     ushr    v0.4s, v0.4s, #20
+    //     ushr    v1.4s, v1.4s, #20
+    //     uzp1    v0.8h, v0.8h, v1.8h
+    //
+    // Can be optimized to:
+    //
+    //     uzp2    v0.8h, v0.8h, v1.8h
+    //     ushr    v0.8h, v0.8h, #4
+    //
+    // This optimization reduces instruction count.
+    if (N00Opc == AArch64ISD::VLSHR &&
+        N10Opc == AArch64ISD::VLSHR &&
+        N00->getOperand(1) == N10->getOperand(1)) {
+
+      SDValue N000 = N00->getOperand(0);
+      SDValue N100 = N10->getOperand(0);
+      uint64_t N001ConstVal = N00->getConstantOperandVal(1),
+               N101ConstVal = N10->getConstantOperandVal(1),
+               NScalarSize = N->getValueType(0).getScalarSizeInBits();
+
+      if (N001ConstVal == N101ConstVal &&
+          N001ConstVal > NScalarSize) {
+
+        SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, dl, VT, N000, N100);
+        SDValue NewShiftConstant =
+            DAG.getConstant(N001ConstVal - NScalarSize, dl, MVT::i32);
+
+        return DAG.getNode(AArch64ISD::VLSHR, dl, VT, Uzp, NewShiftConstant);
+      }
+    }
   }
 
   if (N->getOperand(0).getValueType() == MVT::v4i8 ||

From 9ee70f8d79a3320f1d064c2afb803f13f6b1848c Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Mon, 22 Jul 2024 19:52:11 +0000
Subject: [PATCH 06/11] optimise_combine_large_shifts.ll ->
 optimize_combine_large_shifts.ll

---
 ...e_combine_large_shifts.ll => optimize_combine_large_shifts.ll} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/CodeGen/AArch64/{optimise_combine_large_shifts.ll => optimize_combine_large_shifts.ll} (100%)

diff --git a/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll b/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
similarity index 100%
rename from llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
rename to llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll

From 7b777798e7d7b1258dd1047a56a469fac4ef4957 Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Mon, 22 Jul 2024 20:10:01 +0000
Subject: [PATCH 07/11] cleanup tests

---
 .../test/CodeGen/AArch64/optimize_combine_large_shifts.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll b/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
index d513e7908e703..a37b072c5eb04 100644
--- a/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
+++ b/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+; RUN: update_llc_test_checks < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
 
-define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef %1) local_unnamed_addr #0 {
+define <16 x i8> @combine16ix8(<8 x i16> %0, <8 x i16> %1) {
 ; CHECK-LABEL: combine16ix8
 ; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 ; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #2
@@ -12,7 +12,7 @@ define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef
   ret <16 x i8> %6
 }
 
-define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef %1) local_unnamed_addr #0 {
+define <8 x i16> @combine32ix4(<4 x i32> %0, <4 x i32> %1) {
 ; CHECK-LABEL: combine32ix4
 ; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 ; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #4
@@ -24,7 +24,7 @@ define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef
   ret <8 x i16> %6
 }
 
-define dso_local <4 x i32> @combine64ix2(<2 x i64> noundef %0, <2 x i64> noundef %1) local_unnamed_addr #0 {
+define <4 x i32> @combine64ix2(<2 x i64> %0, <2 x i64> %1) {
 ; CHECK-LABEL: combine64ix2
 ; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #8

From f0f0dc707fbd1864996b2e7b50b3730d128d6b80 Mon Sep 17 00:00:00 2001
From: sopy <contact at sopy.one>
Date: Mon, 22 Jul 2024 23:20:32 +0300
Subject: [PATCH 08/11] Clang-format

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d2db216984fa7..2cbfc93c2019a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19163,8 +19163,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
     //     ushr    v0.8h, v0.8h, #4
     //
     // This optimization reduces instruction count.
-    if (N00Opc == AArch64ISD::VLSHR &&
-        N10Opc == AArch64ISD::VLSHR &&
+    if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
         N00->getOperand(1) == N10->getOperand(1)) {
 
       SDValue N000 = N00->getOperand(0);
@@ -19173,8 +19172,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
                N101ConstVal = N10->getConstantOperandVal(1),
                NScalarSize = N->getValueType(0).getScalarSizeInBits();
 
-      if (N001ConstVal == N101ConstVal &&
-          N001ConstVal > NScalarSize) {
+      if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
 
         SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, dl, VT, N000, N100);
         SDValue NewShiftConstant =

From 0969b27090789e3022f36c773637de306892ad90 Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Tue, 23 Jul 2024 07:11:15 +0000
Subject: [PATCH 09/11] Revert "cleanup tests"

This reverts commit 7b777798e7d7b1258dd1047a56a469fac4ef4957.
---
 .../test/CodeGen/AArch64/optimize_combine_large_shifts.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll b/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
index a37b072c5eb04..d513e7908e703 100644
--- a/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
+++ b/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
@@ -1,6 +1,6 @@
-; RUN: update_llc_test_checks < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
 
-define <16 x i8> @combine16ix8(<8 x i16> %0, <8 x i16> %1) {
+define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: combine16ix8
 ; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 ; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #2
@@ -12,7 +12,7 @@ define <16 x i8> @combine16ix8(<8 x i16> %0, <8 x i16> %1) {
   ret <16 x i8> %6
 }
 
-define <8 x i16> @combine32ix4(<4 x i32> %0, <4 x i32> %1) {
+define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: combine32ix4
 ; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 ; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #4
@@ -24,7 +24,7 @@ define <8 x i16> @combine32ix4(<4 x i32> %0, <4 x i32> %1) {
   ret <8 x i16> %6
 }
 
-define <4 x i32> @combine64ix2(<2 x i64> %0, <2 x i64> %1) {
+define dso_local <4 x i32> @combine64ix2(<2 x i64> noundef %0, <2 x i64> noundef %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: combine64ix2
 ; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #8

From 36314a6a045f1a4fca93261a481ea992afbe8fb5 Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Tue, 23 Jul 2024 07:12:07 +0000
Subject: [PATCH 10/11] cleanup tests

---
 llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll b/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
index d513e7908e703..dbb0235200ac7 100644
--- a/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
+++ b/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
 
-define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef %1) local_unnamed_addr #0 {
+define <16 x i8> @combine16ix8(<8 x i16> %0, <8 x i16> %1) {
 ; CHECK-LABEL: combine16ix8
 ; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 ; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #2
@@ -12,7 +12,7 @@ define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef
   ret <16 x i8> %6
 }
 
-define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef %1) local_unnamed_addr #0 {
+define <8 x i16> @combine32ix4(<4 x i32> %0, <4 x i32> %1) {
 ; CHECK-LABEL: combine32ix4
 ; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 ; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #4
@@ -24,7 +24,7 @@ define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef
   ret <8 x i16> %6
 }
 
-define dso_local <4 x i32> @combine64ix2(<2 x i64> noundef %0, <2 x i64> noundef %1) local_unnamed_addr #0 {
+define <4 x i32> @combine64ix2(<2 x i64> %0, <2 x i64> %1) {
 ; CHECK-LABEL: combine64ix2
 ; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #8

From 95dc6b6327ec4662c8ab32a5c97076c463600904 Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Wed, 24 Jul 2024 16:53:49 +0000
Subject: [PATCH 11/11] Using update_llc_test_checks for regression tests

---
 .../AArch64/optimize_combine_large_shifts.ll  | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll b/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
index dbb0235200ac7..f75b6ac0deb6c 100644
--- a/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
+++ b/llvm/test/CodeGen/AArch64/optimize_combine_large_shifts.ll
@@ -1,10 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
 
 define <16 x i8> @combine16ix8(<8 x i16> %0, <8 x i16> %1) {
-; CHECK-LABEL: combine16ix8
-; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #2
-; CHECK-NEXT: ret
+; CHECK-LABEL: combine16ix8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp2 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #2
+; CHECK-NEXT:    ret
   %3 = lshr <8 x i16> %0, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %4 = lshr <8 x i16> %1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -13,10 +15,11 @@ define <16 x i8> @combine16ix8(<8 x i16> %0, <8 x i16> %1) {
 }
 
 define <8 x i16> @combine32ix4(<4 x i32> %0, <4 x i32> %1) {
-; CHECK-LABEL: combine32ix4
-; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #4
-; CHECK-NEXT: ret
+; CHECK-LABEL: combine32ix4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #4
+; CHECK-NEXT:    ret
   %3 = lshr <4 x i32> %0, <i32 20, i32 20, i32 20, i32 20>
   %4 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
   %5 = shufflevector <4 x i32> %3, <4 x i32> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -25,13 +28,14 @@ define <8 x i16> @combine32ix4(<4 x i32> %0, <4 x i32> %1) {
 }
 
 define <4 x i32> @combine64ix2(<2 x i64> %0, <2 x i64> %1) {
-; CHECK-LABEL: combine64ix2
-; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #8
-; CHECK-NEXT: ret
+; CHECK-LABEL: combine64ix2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #8
+; CHECK-NEXT:    ret
   %3 = lshr <2 x i64> %0, <i64 40, i64 40>
   %4 = lshr <2 x i64> %1, <i64 40, i64 40>
   %5 = shufflevector <2 x i64> %3, <2 x i64> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = trunc nuw nsw <4 x i64> %5 to <4 x i32>
   ret <4 x i32> %6
-}
\ No newline at end of file
+}