[llvm] [AArch64] Optimize two large shifts and a combine into a single combine and shift (PR #99480)

Thu Jul 18 05:18:45 PDT 2024

https://github.com/sopyb created https://github.com/llvm/llvm-project/pull/99480

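Addresses a missing optimization in the AArch64 back-end: when two vectors are each logically shifted right by the same constant that exceeds the narrowed element width, then truncated and concatenated, the two large shifts can be replaced by a single combine operation (UZP2) followed by one shift by the remaining amount.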

Closes #59502

>From e98b4d5b7c8f6d518b28477b49a442e7446c9caa Mon Sep 17 00:00:00 2001
From: sopy <contact at sopy.one>
Date: Fri, 12 Jul 2024 18:30:13 +0300
Subject: [PATCH 1/4] Add test

---
 .../AArch64/optimise_combine_large_shifts.ll  | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll

diff --git a/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll b/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
new file mode 100644
index 0000000000000..8ca7f78519aa4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+
+define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef %1) local_unnamed_addr #0 {
+; CHECK-LABEL: combine16ix8
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #10
+  %3 = lshr <8 x i16> %0, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
+  %4 = lshr <8 x i16> %1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
+  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = trunc nuw nsw <16 x i16> %5 to <16 x i8>
+  ret <16 x i8> %6
+}
+
+define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef %1) local_unnamed_addr #0 {
+; CHECK-LABEL: combine32ix4
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #20
+  %3 = lshr <4 x i32> %0, <i32 20, i32 20, i32 20, i32 20>
+  %4 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
+  %5 = shufflevector <4 x i32> %3, <4 x i32> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = trunc nuw nsw <8 x i32> %5 to <8 x i16>
+  ret <8 x i16> %6
+}
+
+define dso_local <4 x i32> @combine64ix2(<2 x i64> noundef %0, <2 x i64> noundef %1) local_unnamed_addr #0 {
+; CHECK-LABEL: combine64ix2
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #40
+  %3 = lshr <2 x i64> %0, <i64 40, i64 40>
+  %4 = lshr <2 x i64> %1, <i64 40, i64 40>
+  %5 = shufflevector <2 x i64> %3, <2 x i64> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = trunc nuw nsw <4 x i64> %5 to <4 x i32>
+  ret <4 x i32> %6
+}
\ No newline at end of file

>From 49d67b62995d17e53cd1eeaad10513545e1b459e Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Tue, 16 Jul 2024 22:27:56 +0000
Subject: [PATCH 2/4] Fix test

---
 .../CodeGen/AArch64/optimise_combine_large_shifts.ll     | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll b/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
index 8ca7f78519aa4..d513e7908e703 100644
--- a/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
+++ b/llvm/test/CodeGen/AArch64/optimise_combine_large_shifts.ll
@@ -3,7 +3,8 @@
 define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: combine16ix8
 ; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #10
+; CHECK-NEXT: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #2
+; CHECK-NEXT: ret
   %3 = lshr <8 x i16> %0, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %4 = lshr <8 x i16> %1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
   %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -14,7 +15,8 @@ define dso_local <16 x i8> @combine16ix8(<8 x i16> noundef %0, <8 x i16> noundef
 define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: combine32ix4
 ; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #20
+; CHECK-NEXT: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #4
+; CHECK-NEXT: ret
   %3 = lshr <4 x i32> %0, <i32 20, i32 20, i32 20, i32 20>
   %4 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
   %5 = shufflevector <4 x i32> %3, <4 x i32> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -25,7 +27,8 @@ define dso_local <8 x i16> @combine32ix4(<4 x i32> noundef %0, <4 x i32> noundef
 define dso_local <4 x i32> @combine64ix2(<2 x i64> noundef %0, <2 x i64> noundef %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: combine64ix2
 ; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #40
+; CHECK-NEXT: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #8
+; CHECK-NEXT: ret
   %3 = lshr <2 x i64> %0, <i64 40, i64 40>
   %4 = lshr <2 x i64> %1, <i64 40, i64 40>
   %5 = shufflevector <2 x i64> %3, <2 x i64> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

>From 534b12c54a03d078abc891d367d5d0db532d31fb Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Thu, 18 Jul 2024 10:22:14 +0000
Subject: [PATCH 3/4] Update neon-rshrn.ll to align with optimizations

---
 llvm/test/CodeGen/AArch64/neon-rshrn.ll | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/neon-rshrn.ll b/llvm/test/CodeGen/AArch64/neon-rshrn.ll
index 8d47f4afb355f..e648b10ea357b 100644
--- a/llvm/test/CodeGen/AArch64/neon-rshrn.ll
+++ b/llvm/test/CodeGen/AArch64/neon-rshrn.ll
@@ -110,11 +110,10 @@ define <16 x i8> @rshrn_v16i16_9(<16 x i16> %a) {
 ; CHECK-LABEL: rshrn_v16i16_9:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.8h, #1, lsl #8
-; CHECK-NEXT:    add v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    add v1.8h, v1.8h, v2.8h
-; CHECK-NEXT:    ushr v1.8h, v1.8h, #9
-; CHECK-NEXT:    ushr v0.8h, v0.8h, #9
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    add v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    uzp2 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #1
 ; CHECK-NEXT:    ret
 entry:
   %b = add <16 x i16> %a, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
@@ -336,11 +335,10 @@ define <8 x i16> @rshrn_v8i32_17(<8 x i32> %a) {
 ; CHECK-LABEL: rshrn_v8i32_17:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.4s, #1, lsl #16
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #17
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #17
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
 entry:
   %b = add <8 x i32> %a, <i32 65536, i32 65536, i32 65536, i32 65536, i32 65536, i32 65536, i32 65536, i32 65536>
@@ -771,11 +769,10 @@ define <4 x i32> @rshrn_v4i64_33(<4 x i64> %a) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov x8, #4294967296 // =0x100000000
 ; CHECK-NEXT:    dup v2.2d, x8
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    ushr v1.2d, v1.2d, #33
-; CHECK-NEXT:    ushr v0.2d, v0.2d, #33
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
 ; CHECK-NEXT:    ret
 entry:
   %b = add <4 x i64> %a, <i64 4294967296, i64 4294967296, i64 4294967296, i64 4294967296>

>From c996bc98a73c073cd6ac1c8df4a1d3c348816b2e Mon Sep 17 00:00:00 2001
From: sopy <doimpt at sopy.one>
Date: Thu, 18 Jul 2024 10:41:59 +0000
Subject: [PATCH 4/4] Lowering

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 42 +++++++++++++++++--
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eef83a845e2c3..c0cbf95f7198b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1849,7 +1849,7 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
   setOperationAction(ISD::SHL, VT, Custom);
   setOperationAction(ISD::OR, VT, Custom);
   setOperationAction(ISD::SETCC, VT, Custom);
-  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
 
   setOperationAction(ISD::SELECT, VT, Expand);
   setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -14198,9 +14198,43 @@ SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                    !Subtarget->isNeonAvailable()))
     return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
 
-  assert(Op.getValueType().isScalableVector() &&
-         isTypeLegal(Op.getValueType()) &&
-         "Expected legal scalable vector type!");
+  if (!Op.getValueType().isScalableVector()) {
+    const SDValue Trunc1 = Op.getOperand(0);
+    const SDValue Trunc2 = Op.getOperand(1);
+
+    if (Trunc1->getOpcode() == ISD::TRUNCATE &&
+        Trunc2->getOpcode() == ISD::TRUNCATE) {
+      const SDValue Shift1 = Trunc1->getOperand(0);
+      const SDValue Shift2 = Trunc2->getOperand(0);
+
+      // check for VLSHR with the same shift amount
+      if (Shift1->getOpcode() == AArch64ISD::VLSHR &&
+          Shift2->getOpcode() == AArch64ISD::VLSHR &&
+          Shift1->getOperand(1) == Shift2->getOperand(1)) {
+
+        const SDValue Vector1 = Shift1->getOperand(0);
+        const SDValue Vector2 = Shift2->getOperand(0);
+
+        const uint64_t ShiftConstant = Shift1->getConstantOperandVal(1);
+        const uint64_t PostUzpsize = Op.getScalarValueSizeInBits();
+
+        // Check if the shift constant is greater than the scalar value size
+        if (ShiftConstant > PostUzpsize) {
+          const EVT VT = Op.getValueType();
+          const SDLoc DL(Op);
+
+          const SDValue Uzp =
+              DAG.getNode(AArch64ISD::UZP2, DL, VT, Vector1, Vector2);
+          const SDValue NewShiftConstant =
+              DAG.getConstant(ShiftConstant - PostUzpsize, DL, MVT::i32);
+
+          return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
+        }
+      }
+    }
+
+    return Op;
+  }
 
   if (isTypeLegal(Op.getOperand(0).getValueType())) {
     unsigned NumOperands = Op->getNumOperands();