[llvm] [AArch64] Disable consecutive store merging when Neon is unavailable (PR #111519)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 9 05:18:47 PDT 2024


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/111519

From 8e1da8ed192a1ac3d2fbb7e1ff4dc02fc5900474 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 7 Oct 2024 16:26:58 +0000
Subject: [PATCH 1/5] Precommit `consecutive-stores-of-faddv.ll`

---
 .../AArch64/consecutive-stores-of-faddv.ll    | 148 ++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll

diff --git a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
new file mode 100644
index 00000000000000..791399c41e1e07
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -O3 < %s -o - | FileCheck %s --check-prefixes=CHECK
+
+; Tests consecutive stores of @llvm.aarch64.sve.faddv. Within SDAG faddv is
+; lowered as a FADDV + EXTRACT_VECTOR_ELT (of lane 0). Stores of extracts can
+; be matched by DAGCombiner::mergeConsecutiveStores(), which we want to avoid in
+; some cases as it can lead to worse codegen.
+
+define void @consecutive_stores_pair(ptr noalias %dest0, ptr noalias %src0) {
+; CHECK-LABEL: consecutive_stores_pair:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = shl i64 %vscale, 2
+  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
+  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  ret void
+}
+
+define void @consecutive_stores_quadruple(ptr noalias %dest0, ptr noalias %src0) {
+; CHECK-LABEL: consecutive_stores_quadruple:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x1, #3, mul vl]
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    faddv s2, p0, z2.s
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    faddv s3, p0, z3.s
+; CHECK-NEXT:    mov v2.s[1], v3.s[0]
+; CHECK-NEXT:    stp d0, d2, [x0]
+; CHECK-NEXT:    ret
+  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = shl i64 %vscale, 2
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
+  %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
+  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
+  %src2 = getelementptr inbounds float, ptr %src1, i64 %c4_vscale
+  %src3 = getelementptr inbounds float, ptr %src2, i64 %c4_vscale
+  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
+  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
+  %vec2 = load <vscale x 4 x float>, ptr %src2, align 4
+  %vec3 = load <vscale x 4 x float>, ptr %src3, align 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec2)
+  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec3)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  store float %reduce2, ptr %dest2, align 4
+  store float %reduce3, ptr %dest3, align 4
+  ret void
+}
+
+define void @consecutive_stores_pair_streaming_function(ptr noalias %dest0, ptr noalias %src0) #0 "aarch64_pstate_sm_enabled"  {
+; CHECK-LABEL: consecutive_stores_pair_streaming_function:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    stp s1, s0, [sp, #8]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = shl i64 %vscale, 2
+  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
+  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  ret void
+}
+
+define void @consecutive_stores_quadruple_streaming_function(ptr noalias %dest0, ptr noalias %src0) #0 "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: consecutive_stores_quadruple_streaming_function:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, #3, mul vl]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    faddv s2, p0, z2.s
+; CHECK-NEXT:    faddv s3, p0, z3.s
+; CHECK-NEXT:    stp s0, s1, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr d0, [sp]
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    stp s3, s2, [sp, #8]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    str d0, [x0, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = shl i64 %vscale, 2
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
+  %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
+  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
+  %src2 = getelementptr inbounds float, ptr %src1, i64 %c4_vscale
+  %src3 = getelementptr inbounds float, ptr %src2, i64 %c4_vscale
+  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
+  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
+  %vec2 = load <vscale x 4 x float>, ptr %src2, align 4
+  %vec3 = load <vscale x 4 x float>, ptr %src3, align 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec2)
+  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec3)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  store float %reduce2, ptr %dest2, align 4
+  store float %reduce3, ptr %dest3, align 4
+  ret void
+}
+
+attributes #0 = { vscale_range(1, 16) "target-features"="+sve,+sme" }

From b0e3b9d57afe91e2d1e7add9b06b347cb32db12f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 8 Oct 2024 13:50:45 +0000
Subject: [PATCH 2/5] Tidy up tests

---
 .../AArch64/consecutive-stores-of-faddv.ll    | 93 ++++++-------------
 1 file changed, 26 insertions(+), 67 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
index 791399c41e1e07..764296d76fe0ab 100644
--- a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
+++ b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
@@ -1,44 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -O3 < %s -o - | FileCheck %s --check-prefixes=CHECK
+; RUN: llc  -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -O3 < %s -o - | FileCheck %s --check-prefixes=CHECK
 
 ; Tests consecutive stores of @llvm.aarch64.sve.faddv. Within SDAG faddv is
 ; lowered as a FADDV + EXTRACT_VECTOR_ELT (of lane 0). Stores of extracts can
 ; be matched by DAGCombiner::mergeConsecutiveStores(), which we want to avoid in
 ; some cases as it can lead to worse codegen.
 
-define void @consecutive_stores_pair(ptr noalias %dest0, ptr noalias %src0) {
+; TODO: A single `stp s0, s1, [x0]` may be preferred here.
+define void @consecutive_stores_pair(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) {
 ; CHECK-LABEL: consecutive_stores_pair:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
 ; CHECK-NEXT:    faddv s0, p0, z0.s
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-  %vscale = call i64 @llvm.vscale.i64()
-  %c4_vscale = shl i64 %vscale, 2
-  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
-  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
-  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
-  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
-  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
   store float %reduce0, ptr %dest0, align 4
   store float %reduce1, ptr %dest1, align 4
   ret void
 }
 
-define void @consecutive_stores_quadruple(ptr noalias %dest0, ptr noalias %src0) {
+define void @consecutive_stores_quadruple(ptr %dest0,
 ; CHECK-LABEL: consecutive_stores_quadruple:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, #2, mul vl]
-; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x1, #3, mul vl]
 ; CHECK-NEXT:    faddv s0, p0, z0.s
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    faddv s2, p0, z2.s
@@ -47,23 +36,15 @@ define void @consecutive_stores_quadruple(ptr noalias %dest0, ptr noalias %src0)
 ; CHECK-NEXT:    mov v2.s[1], v3.s[0]
 ; CHECK-NEXT:    stp d0, d2, [x0]
 ; CHECK-NEXT:    ret
-  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-  %vscale = call i64 @llvm.vscale.i64()
-  %c4_vscale = shl i64 %vscale, 2
+  <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3)
+{
   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
   %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
   %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
-  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
-  %src2 = getelementptr inbounds float, ptr %src1, i64 %c4_vscale
-  %src3 = getelementptr inbounds float, ptr %src2, i64 %c4_vscale
-  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
-  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
-  %vec2 = load <vscale x 4 x float>, ptr %src2, align 4
-  %vec3 = load <vscale x 4 x float>, ptr %src3, align 4
-  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
-  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
-  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec2)
-  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec3)
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
+  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec2)
+  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec3)
   store float %reduce0, ptr %dest0, align 4
   store float %reduce1, ptr %dest1, align 4
   store float %reduce2, ptr %dest2, align 4
@@ -71,78 +52,56 @@ define void @consecutive_stores_quadruple(ptr noalias %dest0, ptr noalias %src0)
   ret void
 }
 
-define void @consecutive_stores_pair_streaming_function(ptr noalias %dest0, ptr noalias %src0) #0 "aarch64_pstate_sm_enabled"  {
+define void @consecutive_stores_pair_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) "aarch64_pstate_sm_enabled"  {
 ; CHECK-LABEL: consecutive_stores_pair_streaming_function:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    faddv s0, p0, z0.s
 ; CHECK-NEXT:    faddv s1, p0, z1.s
-; CHECK-NEXT:    stp s1, s0, [sp, #8]
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    stp s0, s1, [sp, #8]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-  %vscale = call i64 @llvm.vscale.i64()
-  %c4_vscale = shl i64 %vscale, 2
-  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
-  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
-  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
-  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
-  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
   store float %reduce0, ptr %dest0, align 4
   store float %reduce1, ptr %dest1, align 4
   ret void
 }
 
-define void @consecutive_stores_quadruple_streaming_function(ptr noalias %dest0, ptr noalias %src0) #0 "aarch64_pstate_sm_enabled" {
+define void @consecutive_stores_quadruple_streaming_function(ptr %dest0,
 ; CHECK-LABEL: consecutive_stores_quadruple_streaming_function:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, #3, mul vl]
-; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x1, #2, mul vl]
 ; CHECK-NEXT:    faddv s0, p0, z0.s
 ; CHECK-NEXT:    faddv s1, p0, z1.s
-; CHECK-NEXT:    faddv s2, p0, z2.s
 ; CHECK-NEXT:    faddv s3, p0, z3.s
+; CHECK-NEXT:    faddv s2, p0, z2.s
 ; CHECK-NEXT:    stp s0, s1, [sp, #-16]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr d0, [sp]
 ; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    stp s3, s2, [sp, #8]
+; CHECK-NEXT:    stp s2, s3, [sp, #8]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    str d0, [x0, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-  %ptrue = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-  %vscale = call i64 @llvm.vscale.i64()
-  %c4_vscale = shl i64 %vscale, 2
+  <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) "aarch64_pstate_sm_enabled"
+{
   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
   %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
   %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
-  %src1 = getelementptr inbounds float, ptr %src0, i64 %c4_vscale
-  %src2 = getelementptr inbounds float, ptr %src1, i64 %c4_vscale
-  %src3 = getelementptr inbounds float, ptr %src2, i64 %c4_vscale
-  %vec0 = load <vscale x 4 x float>, ptr %src0, align 4
-  %vec1 = load <vscale x 4 x float>, ptr %src1, align 4
-  %vec2 = load <vscale x 4 x float>, ptr %src2, align 4
-  %vec3 = load <vscale x 4 x float>, ptr %src3, align 4
-  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec0)
-  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec1)
-  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec2)
-  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %ptrue, <vscale x 4 x float> %vec3)
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
+  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec2)
+  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec3)
   store float %reduce0, ptr %dest0, align 4
   store float %reduce1, ptr %dest1, align 4
   store float %reduce2, ptr %dest2, align 4
   store float %reduce3, ptr %dest3, align 4
   ret void
 }
-
-attributes #0 = { vscale_range(1, 16) "target-features"="+sve,+sme" }

From a635d7a6c9fc62ea58c2ebf111835493f3791784 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 8 Oct 2024 10:28:37 +0000
Subject: [PATCH 3/5] [AArch64] Disable consecutive store merging when Neon is
 unavailable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lowering fixed-length BUILD_VECTORs without Neon may introduce stack
spills, leading to more stores/reloads than if the stores had not been
merged. In some cases, it can also prevent the use of paired store
instructions.

In the future, we may want to relax this restriction when SVE is
available, but currently the SVE lowerings for BUILD_VECTOR are limited
to a few specific cases.
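
For reference, a sketch of the new hook as it ends up after the follow-up
fixups later in this series (assembled from the hunks below; it assumes the
usual AArch64ISelLowering.cpp context, e.g. the `Subtarget` member):

  bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                                               const MachineFunction &MF) const {
    // Avoid merging stores into fixed-length vectors when Neon is unavailable.
    // In future, we could allow this when SVE is available, but currently the
    // SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
    // the general lowering may introduce stack spills/reloads).
    if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
      return false;

    // With NoImplicitFloat set, only allow merged stores of at most 64 bits,
    // so that no 128-bit FP/vector values are created.
    bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
    return !NoFloat || MemVT.getSizeInBits() <= 64;
  }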
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 17 +++++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h | 11 +---------
 .../AArch64/consecutive-stores-of-faddv.ll    | 21 +++++--------------
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 288fd3639e5eb7..ed56960a568d8f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27924,6 +27924,23 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
   return OptSize && !VT.isVector();
 }
 
+bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+                                             const MachineFunction &MF) const {
+  // Avoid merging stores into fixed-length vectors when Neon is unavailable.
+  // Until we have more general SVE lowerings for BUILD_VECTOR this may
+  // introduce stack spills.
+  if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
+    return false;
+
+  // Do not merge to float value size (128 bytes) if no implicit
+  // float attribute is set.
+  bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
+
+  if (NoFloat)
+    return (MemVT.getSizeInBits() <= 64);
+  return true;
+}
+
 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
   // We want inc-of-add for scalars and sub-of-not for vectors.
   return VT.isScalarInteger();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1bae7562f459a5..be8ab5ee76a05e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -849,16 +849,7 @@ class AArch64TargetLowering : public TargetLowering {
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
   bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
-                        const MachineFunction &MF) const override {
-    // Do not merge to float value size (128 bytes) if no implicit
-    // float attribute is set.
-
-    bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
-
-    if (NoFloat)
-      return (MemVT.getSizeInBits() <= 64);
-    return true;
-  }
+                        const MachineFunction &MF) const override;
 
   bool isCheapToSpeculateCttz(Type *) const override {
     return true;
diff --git a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
index 764296d76fe0ab..86d9733a88c749 100644
--- a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
+++ b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
@@ -55,15 +55,10 @@ define void @consecutive_stores_quadruple(ptr %dest0,
 define void @consecutive_stores_pair_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) "aarch64_pstate_sm_enabled"  {
 ; CHECK-LABEL: consecutive_stores_pair_streaming_function:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    faddv s0, p0, z0.s
-; CHECK-NEXT:    stp s0, s1, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    faddv s1, p0, z1.s
+; CHECK-NEXT:    stp s0, s1, [x0]
 ; CHECK-NEXT:    ret
   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
   %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
@@ -79,16 +74,10 @@ define void @consecutive_stores_quadruple_streaming_function(ptr %dest0,
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    faddv s0, p0, z0.s
 ; CHECK-NEXT:    faddv s1, p0, z1.s
-; CHECK-NEXT:    faddv s3, p0, z3.s
 ; CHECK-NEXT:    faddv s2, p0, z2.s
-; CHECK-NEXT:    stp s0, s1, [sp, #-16]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr d0, [sp]
-; CHECK-NEXT:    str d0, [x0]
-; CHECK-NEXT:    stp s2, s3, [sp, #8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    str d0, [x0, #8]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    stp s0, s1, [x0]
+; CHECK-NEXT:    faddv s3, p0, z3.s
+; CHECK-NEXT:    stp s2, s3, [x0, #8]
 ; CHECK-NEXT:    ret
   <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) "aarch64_pstate_sm_enabled"
 {

From b1a66c5c4a92334cfd87388d52339a6bfe432720 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 8 Oct 2024 11:45:45 +0000
Subject: [PATCH 4/5] Fixup comment

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ed56960a568d8f..2f2d18303334e5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27927,8 +27927,9 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
 bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                                              const MachineFunction &MF) const {
   // Avoid merging stores into fixed-length vectors when Neon is unavailable.
-  // Until we have more general SVE lowerings for BUILD_VECTOR this may
-  // introduce stack spills.
+  // In future, we could allow this when SVE is available, but currently,
+  // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
+  // the general lowering may introduce stack spills/reloads).
   if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
     return false;
 

From 270da88e74afc9990b4803f9a396fa36437f705d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 9 Oct 2024 12:17:14 +0000
Subject: [PATCH 5/5] Fixups

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp          | 5 +----
 llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll | 8 ++------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2f2d18303334e5..cf60cd66253596 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27936,10 +27936,7 @@ bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
   // Do not merge to float value size (128 bytes) if no implicit
   // float attribute is set.
   bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
-
-  if (NoFloat)
-    return (MemVT.getSizeInBits() <= 64);
-  return true;
+  return !NoFloat || MemVT.getSizeInBits() <= 64;
 }
 
 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
diff --git a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
index 86d9733a88c749..64482e15aed81e 100644
--- a/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
+++ b/llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll
@@ -24,7 +24,7 @@ define void @consecutive_stores_pair(ptr %dest0, <vscale x 4 x float> %vec0, <vs
   ret void
 }
 
-define void @consecutive_stores_quadruple(ptr %dest0,
+define void @consecutive_stores_quadruple(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) {
 ; CHECK-LABEL: consecutive_stores_quadruple:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
@@ -36,8 +36,6 @@ define void @consecutive_stores_quadruple(ptr %dest0,
 ; CHECK-NEXT:    mov v2.s[1], v3.s[0]
 ; CHECK-NEXT:    stp d0, d2, [x0]
 ; CHECK-NEXT:    ret
-  <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3)
-{
   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
   %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
   %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
@@ -68,7 +66,7 @@ define void @consecutive_stores_pair_streaming_function(ptr %dest0, <vscale x 4
   ret void
 }
 
-define void @consecutive_stores_quadruple_streaming_function(ptr %dest0,
+define void @consecutive_stores_quadruple_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: consecutive_stores_quadruple_streaming_function:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
@@ -79,8 +77,6 @@ define void @consecutive_stores_quadruple_streaming_function(ptr %dest0,
 ; CHECK-NEXT:    faddv s3, p0, z3.s
 ; CHECK-NEXT:    stp s2, s3, [x0, #8]
 ; CHECK-NEXT:    ret
-  <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) "aarch64_pstate_sm_enabled"
-{
   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
   %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
   %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4



More information about the llvm-commits mailing list