[llvm] [AArch64] Don't generate st2 for 64bit store that can use stp (PR #69901)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 23 00:37:09 PDT 2023
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/69901
D142966 made it so that an st2 which does not start at element 0 uses zip2 instead of st2. This extends that to any 64-bit st2 that has a nearby paired store, which can better become a STP operation that is expected to have a higher throughput. It searches up to 20 instructions away for a store to p+16 or p-16.
>From d4cbab16748f0b51d739678f22e6b8da63bbf202 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 23 Oct 2023 08:32:19 +0100
Subject: [PATCH] [AArch64] Don't generate st2 for 64bit store that can use stp
D142966 made it so that an st2 which does not start at element 0 uses zip2
instead of st2. This extends that to any 64-bit st2 that has a nearby paired
store, which can better become a STP operation that is expected to have a
higher throughput. It searches up to 20 instructions away for a store to
p+16 or p-16.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 34 ++++++++++-
.../AArch64/machine-cse-profitable-check.ll | 28 +++------
llvm/test/CodeGen/AArch64/vldn_shuffle.ll | 57 +++++++++----------
3 files changed, 67 insertions(+), 52 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a16a102e472e709..8cf1f95c8b7ef05 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15236,6 +15236,29 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
return true;
}
+template <typename Iter>
+bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
+ int MaxLookupDist = 20;
+ unsigned IdxWidth = DL.getIndexSizeInBits(0);
+ APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
+ const Value *PtrA1 =
+ Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+
+ while (++It != End && !It->isDebugOrPseudoInst() && MaxLookupDist-- > 0) {
+ if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
+ const Value *PtrB1 =
+ SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
+ DL, OffsetB);
+ if (PtrA1 == PtrB1 &&
+ (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
+ .abs() == 16)
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
@@ -15327,8 +15350,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return false;
}
// A 64bit st2 which does not start at element 0 will involved adding extra
- // ext elements, making the st2 unprofitable.
- if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && Mask[0] != 0)
+ // ext elements making the st2 unprofitable, and if there is a nearby store
+ // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
+ // zip;ldp pair which has higher throughput.
+ if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
+ (Mask[0] != 0 ||
+ hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(),
+ BaseAddr, DL) ||
+ hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
+ BaseAddr, DL)))
return false;
Type *PtrTy = SI->getPointerOperandType();
diff --git a/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll b/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll
index d12240a9f4f3201..7f678d2c91dccc5 100644
--- a/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll
+++ b/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll
@@ -1,26 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK-BASE
-; RUN: llc -mtriple aarch64-none-linux-gnu -aggressive-machine-cse < %s | FileCheck %s --check-prefixes=CHECK-AGGRESSIVE-CSE
+; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s
define void @foo(ptr %buf, <8 x i16> %a) {
-; CHECK-BASE-LABEL: foo:
-; CHECK-BASE: // %bb.0: // %entry
-; CHECK-BASE-NEXT: movi v2.2d, #0000000000000000
-; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-BASE-NEXT: zip2 v2.8h, v0.8h, v2.8h
-; CHECK-BASE-NEXT: movi v1.2d, #0000000000000000
-; CHECK-BASE-NEXT: st2 { v0.4h, v1.4h }, [x0], #16
-; CHECK-BASE-NEXT: str q2, [x0]
-; CHECK-BASE-NEXT: ret
-;
-; CHECK-AGGRESSIVE-CSE-LABEL: foo:
-; CHECK-AGGRESSIVE-CSE: // %bb.0: // %entry
-; CHECK-AGGRESSIVE-CSE-NEXT: // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-AGGRESSIVE-CSE-NEXT: movi v1.2d, #0000000000000000
-; CHECK-AGGRESSIVE-CSE-NEXT: zip2 v2.8h, v0.8h, v1.8h
-; CHECK-AGGRESSIVE-CSE-NEXT: st2 { v0.4h, v1.4h }, [x0], #16
-; CHECK-AGGRESSIVE-CSE-NEXT: str q2, [x0]
-; CHECK-AGGRESSIVE-CSE-NEXT: ret
+; CHECK-LABEL: foo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h
+; CHECK-NEXT: zip2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: ret
entry:
%vzip.i = shufflevector <8 x i16> %a, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
%vzip1.i = shufflevector <8 x i16> %a, <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 5c73ba16972beb7..d5066aafb816d79 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -493,41 +493,38 @@ define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %
define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
; CHECK-LABEL: transpose_s16_8x8_:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q2, q3, [x0, #32]
; CHECK-NEXT: ldp q4, q5, [x0, #64]
-; CHECK-NEXT: mov x9, x0
-; CHECK-NEXT: ldr q0, [x8, #16]!
-; CHECK-NEXT: mov x10, x0
-; CHECK-NEXT: ldr q3, [x0]
; CHECK-NEXT: ldp q6, q7, [x0, #96]
+; CHECK-NEXT: trn1 v16.8h, v0.8h, v1.8h
+; CHECK-NEXT: trn2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: trn1 v1.8h, v2.8h, v3.8h
+; CHECK-NEXT: trn2 v2.8h, v2.8h, v3.8h
; CHECK-NEXT: trn1 v17.8h, v4.8h, v5.8h
-; CHECK-NEXT: ldr q1, [x9, #32]!
-; CHECK-NEXT: trn1 v16.8h, v3.8h, v0.8h
-; CHECK-NEXT: ldr q2, [x10, #48]!
-; CHECK-NEXT: trn2 v4.8h, v4.8h, v5.8h
-; CHECK-NEXT: trn1 v19.8h, v6.8h, v7.8h
-; CHECK-NEXT: trn2 v0.8h, v3.8h, v0.8h
-; CHECK-NEXT: trn2 v3.8h, v6.8h, v7.8h
-; CHECK-NEXT: trn1 v18.8h, v1.8h, v2.8h
-; CHECK-NEXT: trn2 v1.8h, v1.8h, v2.8h
+; CHECK-NEXT: trn2 v3.8h, v4.8h, v5.8h
+; CHECK-NEXT: trn1 v18.8h, v6.8h, v7.8h
+; CHECK-NEXT: trn2 v4.8h, v6.8h, v7.8h
; CHECK-NEXT: trn1 v5.4s, v16.4s, v17.4s
+; CHECK-NEXT: trn1 v7.4s, v0.4s, v3.4s
; CHECK-NEXT: trn2 v16.4s, v16.4s, v17.4s
-; CHECK-NEXT: trn1 v20.4s, v0.4s, v4.4s
-; CHECK-NEXT: trn1 v6.4s, v18.4s, v19.4s
-; CHECK-NEXT: trn2 v17.4s, v18.4s, v19.4s
-; CHECK-NEXT: trn2 v18.4s, v0.4s, v4.4s
-; CHECK-NEXT: trn1 v21.4s, v1.4s, v3.4s
-; CHECK-NEXT: trn2 v19.4s, v1.4s, v3.4s
-; CHECK-NEXT: zip2 v0.4s, v5.4s, v6.4s
-; CHECK-NEXT: zip2 v2.4s, v16.4s, v17.4s
-; CHECK-NEXT: st2 { v5.2s, v6.2s }, [x0]
-; CHECK-NEXT: zip2 v1.4s, v20.4s, v21.4s
-; CHECK-NEXT: zip2 v3.4s, v18.4s, v19.4s
-; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x8]
-; CHECK-NEXT: st2 { v16.2s, v17.2s }, [x9]
-; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x10]
-; CHECK-NEXT: stp q0, q1, [x0, #64]
-; CHECK-NEXT: stp q2, q3, [x0, #96]
+; CHECK-NEXT: trn1 v6.4s, v1.4s, v18.4s
+; CHECK-NEXT: trn1 v19.4s, v2.4s, v4.4s
+; CHECK-NEXT: trn2 v1.4s, v1.4s, v18.4s
+; CHECK-NEXT: trn2 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: trn2 v2.4s, v2.4s, v4.4s
+; CHECK-NEXT: zip1 v3.4s, v5.4s, v6.4s
+; CHECK-NEXT: zip1 v4.4s, v7.4s, v19.4s
+; CHECK-NEXT: zip1 v17.4s, v16.4s, v1.4s
+; CHECK-NEXT: zip1 v18.4s, v0.4s, v2.4s
+; CHECK-NEXT: zip2 v5.4s, v5.4s, v6.4s
+; CHECK-NEXT: zip2 v1.4s, v16.4s, v1.4s
+; CHECK-NEXT: zip2 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: stp q3, q4, [x0]
+; CHECK-NEXT: zip2 v3.4s, v7.4s, v19.4s
+; CHECK-NEXT: stp q17, q18, [x0, #32]
+; CHECK-NEXT: stp q1, q0, [x0, #96]
+; CHECK-NEXT: stp q5, q3, [x0, #64]
; CHECK-NEXT: ret
%2 = load <8 x i16>, ptr %0, align 16
%3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
More information about the llvm-commits
mailing list