[llvm] [AArch64] Disable consecutive store merging when Neon is unavailable (PR #111519)

Tue Oct 8 04:03:55 PDT 2024

https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/111519

Lowering fixed-size BUILD_VECTORS without Neon may introduce stack
spills, leading to more stores/reloads than if the stores were not
merged. In some cases, it can also prevent using paired store
instructions.

In the future, we may want to relax this restriction when SVE is
available, but currently, the SVE lowerings for BUILD_VECTOR are limited
to a few specific cases.

>From e573b7e4b72c7b912ce75336bc80b58993b384b5 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 7 Oct 2024 16:26:58 +0000
Subject: [PATCH 1/2] Precommit `consecutive-stores-of-faddv.ll`

---
 .../AArch64/consecutive-stores-of-faddv.ll    | 148 ++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll

diff --git a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
new file mode 100644
index 00000000000000..791399c41e1e07
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -O3 < %s -o - | FileCheck %s --check-prefixes=CHECK
+
+; Tests consecutive stores of @llvm.aarch64.sve.faddv. Within SDAG faddv is
+; lowered as a FADDV + EXTRACT_VECTOR_ELT (of lane 0). Stores of extracts can
+; be matched by DAGCombiner::mergeConsecutiveStores(), which we want to avoid in
+; some cases as it can lead to worse codegen.
+
+define void @consecutive_stores_pair(ptr noalias %dest0, ptr noalias %src0) {
+; CHECK-LABEL: consecutive_stores_pair:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = shl i64 %vscale, 2
+  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
+  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  ret void
+}
+
+define void @consecutive_stores_quadruple(ptr noalias %dest0, ptr noalias %src0) {
+; CHECK-LABEL: consecutive_stores_quadruple:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x1, #3, mul vl]
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    faddv s2, p0, z2.s
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    faddv s3, p0, z3.s
+; CHECK-NEXT:    mov v2.s[1], v3.s[0]
+; CHECK-NEXT:    stp d0, d2, [x0]
+; CHECK-NEXT:    ret
+  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = shl i64 %vscale, 2
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
+  %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
+  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
+  %src2 = getelementptr inbounds float, ptr %src1, i64 %c4_vscale
+  %src3 = getelementptr inbounds float, ptr %src2, i64 %c4_vscale
+  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
+  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
+  %vec2 = load <vscale x 4 x float>, ptr %src2, align 4
+  %vec3 = load <vscale x 4 x float>, ptr %src3, align 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec2)
+  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec3)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  store float %reduce2, ptr %dest2, align 4
+  store float %reduce3, ptr %dest3, align 4
+  ret void
+}
+
+define void @consecutive_stores_pair_streaming_function(ptr noalias %dest0, ptr noalias %src0) #0 "aarch64_pstate_sm_enabled"  {
+; CHECK-LABEL: consecutive_stores_pair_streaming_function:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    stp s1, s0, [sp, #8]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = shl i64 %vscale, 2
+  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
+  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  ret void
+}
+
+define void @consecutive_stores_quadruple_streaming_function(ptr noalias %dest0, ptr noalias %src0) #0 "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: consecutive_stores_quadruple_streaming_function:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, #3, mul vl]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    faddv s2, p0, z2.s
+; CHECK-NEXT:    faddv s3, p0, z3.s
+; CHECK-NEXT:    stp s0, s1, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr d0, [sp]
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    stp s3, s2, [sp, #8]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    str d0, [x0, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = shl i64 %vscale, 2
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
+  %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
+  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
+  %src2 = getelementptr inbounds float, ptr %src1, i64 %c4_vscale
+  %src3 = getelementptr inbounds float, ptr %src2, i64 %c4_vscale
+  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
+  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
+  %vec2 = load <vscale x 4 x float>, ptr %src2, align 4
+  %vec3 = load <vscale x 4 x float>, ptr %src3, align 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec2)
+  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec3)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  store float %reduce2, ptr %dest2, align 4
+  store float %reduce3, ptr %dest3, align 4
+  ret void
+}
+
+attributes #0 = { vscale_range(1, 16) "target-features"="+sve,+sme" }

>From 321290a49d409620a8072f5bd8d8f917b215d09f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 8 Oct 2024 10:28:37 +0000
Subject: [PATCH 2/2] [AArch64] Disable consecutive store merging when Neon is
 unavailable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lowering fixed-size BUILD_VECTORS without Neon may introduce stack
spills, leading to more stores/reloads than if the stores were not
merged. In some cases, it can also prevent using paired store
instructions.

In the future, we may want to relax this restriction when SVE is
available, but currently, the SVE lowerings for BUILD_VECTOR are limited
to a few specific cases.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 17 ++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h | 11 +-------
 .../AArch64/consecutive-stores-of-faddv.ll    | 26 ++++++-------------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 48e1b96d841efb..6e19bf1b4b175a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27879,6 +27879,23 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
   return OptSize && !VT.isVector();
 }
 
+bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+                                             const MachineFunction &MF) const {
+  // Avoid merging stores into fixed-length vectors when Neon is unavailable.
+  // Until we have more general SVE lowerings for BUILD_VECTOR this may
+  // introduce stack spills.
+  if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
+    return false;
+
+  // If the NoImplicitFloat attribute is set, do not merge to sizes wider than
+  // 64 bits (i.e. the 128-bit float value size).
+  bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
+
+  if (NoFloat)
+    return (MemVT.getSizeInBits() <= 64);
+  return true;
+}
+
 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
   // We want inc-of-add for scalars and sub-of-not for vectors.
   return VT.isScalarInteger();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 480bf60360bf55..04ab5d974ccbf0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -848,16 +848,7 @@ class AArch64TargetLowering : public TargetLowering {
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
   bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
-                        const MachineFunction &MF) const override {
-    // Do not merge to float value size (128 bytes) if no implicit
-    // float attribute is set.
-
-    bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
-
-    if (NoFloat)
-      return (MemVT.getSizeInBits() <= 64);
-    return true;
-  }
+                        const MachineFunction &MF) const override;
 
   bool isCheapToSpeculateCttz(Type *) const override {
     return true;
diff --git a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
index 791399c41e1e07..13ef983501d26f 100644
--- a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
+++ b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
@@ -6,6 +6,7 @@
 ; be matched by DAGCombiner::mergeConsecutiveStores(), which we want to avoid in
 ; some cases as it can lead to worse codegen.
 
+; TODO: A single `stp s0, s1, [x0]` may be preferred here.
 define void @consecutive_stores_pair(ptr noalias %dest0, ptr noalias %src0) {
 ; CHECK-LABEL: consecutive_stores_pair:
 ; CHECK:       // %bb.0:
@@ -74,17 +75,12 @@ define void @consecutive_stores_quadruple(ptr noalias %dest0, ptr noalias %src0)
 define void @consecutive_stores_pair_streaming_function(ptr noalias %dest0, ptr noalias %src0) #0 "aarch64_pstate_sm_enabled"  {
 ; CHECK-LABEL: consecutive_stores_pair_streaming_function:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
 ; CHECK-NEXT:    faddv s0, p0, z0.s
 ; CHECK-NEXT:    faddv s1, p0, z1.s
-; CHECK-NEXT:    stp s1, s0, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    stp s0, s1, [x0]
 ; CHECK-NEXT:    ret
   %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %vscale = call i64 @llvm.vscale.i64()
@@ -106,20 +102,14 @@ define void @consecutive_stores_quadruple_streaming_function(ptr noalias %dest0,
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, #3, mul vl]
-; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x1, #3, mul vl]
 ; CHECK-NEXT:    faddv s0, p0, z0.s
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    faddv s2, p0, z2.s
+; CHECK-NEXT:    stp s0, s1, [x0]
 ; CHECK-NEXT:    faddv s3, p0, z3.s
-; CHECK-NEXT:    stp s0, s1, [sp, #-16]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr d0, [sp]
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    stp s3, s2, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    str d0, [x0, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    stp s2, s3, [x0, #8]
 ; CHECK-NEXT:    ret
   %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %vscale = call i64 @llvm.vscale.i64()