[llvm] [AArch64] Don't generate st2 for 64bit store that can use stp (PR #69901)

Mon Oct 23 00:40:01 PDT 2023

https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/69901

>From 8bca87a996e71ad87c077a11d682e39ec851058e Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 23 Oct 2023 08:32:19 +0100
Subject: [PATCH] [AArch64] Don't generate st2 for 64bit store that can use stp

D142966 made it so that st2 that do not start at element 0 use zip2 instead of
st2. This extends that to any 64bit store that has a nearby store that can
better become an STP operation, which is expected to have a higher throughput.
It searches up to 20 instructions away for a store to p+16 or p-16.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 34 ++++++++++-
 .../AArch64/machine-cse-profitable-check.ll   | 28 +++------
 llvm/test/CodeGen/AArch64/vldn_shuffle.ll     | 57 +++++++++----------
 3 files changed, 67 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a16a102e472e709..bff4392b7de4d94 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15236,6 +15236,29 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   return true;
 }
 
+/// Look for a store close to the one at \p It that writes 16 away from \p Ptr.
+///
+/// The scan advances \p It before examining anything, so the instruction that
+/// \p It initially refers to is never inspected, and it stops at \p End.
+/// Passing reverse iterators scans backwards instead of forwards. At most 20
+/// non-debug, non-pseudo instructions are examined.
+///
+/// \param It  Iterator positioned at the starting instruction.
+/// \param End End of the range to scan.
+/// \param Ptr Address whose neighbouring stores are of interest.
+/// \param DL  Data layout used to size and accumulate constant offsets.
+/// \returns true if a scanned StoreInst has a pointer operand that strips to
+/// the same underlying value as \p Ptr and whose accumulated in-bounds
+/// constant offset differs from that of \p Ptr by exactly 16.
+template <typename Iter>
+static bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr,
+                                 const DataLayout &DL) {
+  int MaxLookupDist = 20;
+  unsigned IdxWidth = DL.getIndexSizeInBits(0);
+  APInt OffsetA(IdxWidth, 0);
+  const Value *PtrA1 =
+      Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+
+  while (++It != End) {
+    // Debug and pseudo instructions must neither stop the scan nor consume the
+    // lookup budget, otherwise the result would depend on debug info.
+    if (It->isDebugOrPseudoInst())
+      continue;
+    if (MaxLookupDist-- <= 0)
+      break;
+
+    const auto *SI = dyn_cast<StoreInst>(&*It);
+    if (!SI)
+      continue;
+
+    // stripAndAccumulateInBoundsConstantOffsets adds to the APInt it is given,
+    // so start from zero for every candidate store.
+    APInt OffsetB(IdxWidth, 0);
+    const Value *PtrB1 =
+        SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
+            DL, OffsetB);
+    if (PtrA1 == PtrB1 &&
+        (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
+                .abs() == 16)
+      return true;
+  }
+
+  return false;
+}
+
 /// Lower an interleaved store into a stN intrinsic.
 ///
 /// E.g. Lower an interleaved store (Factor = 3):
@@ -15327,8 +15350,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
     return false;
   }
   // A 64bit st2 which does not start at element 0 will involved adding extra
-  // ext elements, making the st2 unprofitable.
-  if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && Mask[0] != 0)
+  // ext elements making the st2 unprofitable, and if there is a nearby store
+  // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
+  // zip;stp pair which has higher throughput.
+  if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
+      (Mask[0] != 0 ||
+       hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
+                            DL) ||
+       hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
+                            BaseAddr, DL)))
     return false;
 
   Type *PtrTy = SI->getPointerOperandType();
diff --git a/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll b/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll
index d12240a9f4f3201..7f678d2c91dccc5 100644
--- a/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll
+++ b/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll
@@ -1,26 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK-BASE
-; RUN: llc -mtriple aarch64-none-linux-gnu -aggressive-machine-cse < %s | FileCheck %s --check-prefixes=CHECK-AGGRESSIVE-CSE
+; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s
 
 define void @foo(ptr %buf, <8 x i16> %a) {
-; CHECK-BASE-LABEL: foo:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-BASE-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-BASE-NEXT:    zip2 v2.8h, v0.8h, v2.8h
-; CHECK-BASE-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-BASE-NEXT:    st2 { v0.4h, v1.4h }, [x0], #16
-; CHECK-BASE-NEXT:    str q2, [x0]
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-AGGRESSIVE-CSE-LABEL: foo:
-; CHECK-AGGRESSIVE-CSE:       // %bb.0: // %entry
-; CHECK-AGGRESSIVE-CSE-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-AGGRESSIVE-CSE-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-AGGRESSIVE-CSE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
-; CHECK-AGGRESSIVE-CSE-NEXT:    st2 { v0.4h, v1.4h }, [x0], #16
-; CHECK-AGGRESSIVE-CSE-NEXT:    str q2, [x0]
-; CHECK-AGGRESSIVE-CSE-NEXT:    ret
+; CHECK-LABEL: foo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    zip1 v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    zip2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ret
 entry:
   %vzip.i = shufflevector <8 x i16> %a, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 5c73ba16972beb7..d5066aafb816d79 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -493,41 +493,38 @@ define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %
 define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
 ; CHECK-LABEL: transpose_s16_8x8_:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0, #32]
 ; CHECK-NEXT:    ldp q4, q5, [x0, #64]
-; CHECK-NEXT:    mov x9, x0
-; CHECK-NEXT:    ldr q0, [x8, #16]!
-; CHECK-NEXT:    mov x10, x0
-; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    ldp q6, q7, [x0, #96]
+; CHECK-NEXT:    trn1 v16.8h, v0.8h, v1.8h
+; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
+; CHECK-NEXT:    trn2 v2.8h, v2.8h, v3.8h
 ; CHECK-NEXT:    trn1 v17.8h, v4.8h, v5.8h
-; CHECK-NEXT:    ldr q1, [x9, #32]!
-; CHECK-NEXT:    trn1 v16.8h, v3.8h, v0.8h
-; CHECK-NEXT:    ldr q2, [x10, #48]!
-; CHECK-NEXT:    trn2 v4.8h, v4.8h, v5.8h
-; CHECK-NEXT:    trn1 v19.8h, v6.8h, v7.8h
-; CHECK-NEXT:    trn2 v0.8h, v3.8h, v0.8h
-; CHECK-NEXT:    trn2 v3.8h, v6.8h, v7.8h
-; CHECK-NEXT:    trn1 v18.8h, v1.8h, v2.8h
-; CHECK-NEXT:    trn2 v1.8h, v1.8h, v2.8h
+; CHECK-NEXT:    trn2 v3.8h, v4.8h, v5.8h
+; CHECK-NEXT:    trn1 v18.8h, v6.8h, v7.8h
+; CHECK-NEXT:    trn2 v4.8h, v6.8h, v7.8h
 ; CHECK-NEXT:    trn1 v5.4s, v16.4s, v17.4s
+; CHECK-NEXT:    trn1 v7.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
-; CHECK-NEXT:    trn1 v20.4s, v0.4s, v4.4s
-; CHECK-NEXT:    trn1 v6.4s, v18.4s, v19.4s
-; CHECK-NEXT:    trn2 v17.4s, v18.4s, v19.4s
-; CHECK-NEXT:    trn2 v18.4s, v0.4s, v4.4s
-; CHECK-NEXT:    trn1 v21.4s, v1.4s, v3.4s
-; CHECK-NEXT:    trn2 v19.4s, v1.4s, v3.4s
-; CHECK-NEXT:    zip2 v0.4s, v5.4s, v6.4s
-; CHECK-NEXT:    zip2 v2.4s, v16.4s, v17.4s
-; CHECK-NEXT:    st2 { v5.2s, v6.2s }, [x0]
-; CHECK-NEXT:    zip2 v1.4s, v20.4s, v21.4s
-; CHECK-NEXT:    zip2 v3.4s, v18.4s, v19.4s
-; CHECK-NEXT:    st2 { v20.2s, v21.2s }, [x8]
-; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x9]
-; CHECK-NEXT:    st2 { v18.2s, v19.2s }, [x10]
-; CHECK-NEXT:    stp q0, q1, [x0, #64]
-; CHECK-NEXT:    stp q2, q3, [x0, #96]
+; CHECK-NEXT:    trn1 v6.4s, v1.4s, v18.4s
+; CHECK-NEXT:    trn1 v19.4s, v2.4s, v4.4s
+; CHECK-NEXT:    trn2 v1.4s, v1.4s, v18.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    trn2 v2.4s, v2.4s, v4.4s
+; CHECK-NEXT:    zip1 v3.4s, v5.4s, v6.4s
+; CHECK-NEXT:    zip1 v4.4s, v7.4s, v19.4s
+; CHECK-NEXT:    zip1 v17.4s, v16.4s, v1.4s
+; CHECK-NEXT:    zip1 v18.4s, v0.4s, v2.4s
+; CHECK-NEXT:    zip2 v5.4s, v5.4s, v6.4s
+; CHECK-NEXT:    zip2 v1.4s, v16.4s, v1.4s
+; CHECK-NEXT:    zip2 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    stp q3, q4, [x0]
+; CHECK-NEXT:    zip2 v3.4s, v7.4s, v19.4s
+; CHECK-NEXT:    stp q17, q18, [x0, #32]
+; CHECK-NEXT:    stp q1, q0, [x0, #96]
+; CHECK-NEXT:    stp q5, q3, [x0, #64]
 ; CHECK-NEXT:    ret
   %2 = load <8 x i16>, ptr %0, align 16
   %3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1