[llvm] [AArch64][SVE] Fold integer lane extract and store to FPR store (PR #129756)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 6 11:40:19 PST 2025


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/129756

>From 594347ddd1726985222da07755af2ed3fa13f2df Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 4 Mar 2025 16:18:34 +0000
Subject: [PATCH 1/7] [AArch64][SVE] Fold integer lane 0 extract and store to
 FPR store

This avoids unnecessary fmovs from FPRs to GPRs ahead of the store, which
may be slow in streaming mode.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   4 +-
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  32 ++
 .../CodeGen/AArch64/aarch64-sve-ldst-one.ll   | 328 ++++++++++++++++++
 ...plex-deinterleaving-reductions-scalable.ll |   5 +-
 ...sve-streaming-mode-fixed-length-bitcast.ll |   3 +-
 ...e-streaming-mode-fixed-length-ext-loads.ll |  32 +-
 ...-streaming-mode-fixed-length-ld2-alloca.ll |  11 +-
 ...mode-fixed-length-masked-gather-scatter.ll |   5 +-
 ...eaming-mode-fixed-length-optimize-ptrue.ll |   3 +-
 .../sve-streaming-mode-fixed-length-stores.ll |   3 +-
 10 files changed, 390 insertions(+), 36 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c836f3138a45f..3bfc1a922357a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4581,8 +4581,6 @@ let Predicates = [IsLE] in {
             (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
 }
 
-} // AddedComplexity = 10
-
 // unscaled i64 truncating stores
 def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
   (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
@@ -4591,6 +4589,8 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
 def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
   (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
 
+} // AddedComplexity = 10
+
 // Match stores from lane 0 to the appropriate subreg's store.
 multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
                              ValueType VTy, ValueType STy,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4365e573d8b16..c5a246296ae0b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1988,6 +1988,38 @@ let Predicates = [HasSVE_or_SME] in {
   def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 6))),
             (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
 
+  // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+  multiclass SVEVecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
+                              ValueType VTy, ValueType STy,
+                              ValueType SubRegTy,
+                              SubRegIndex SubRegIdx, Operand IndexType,
+                              Instruction STR> {
+    def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+                      (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+              (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+                  GPR64sp:$Rn, IndexType:$offset)>;
+  }
+
+  let AddedComplexity = 19 in {
+    // Lane 0 truncating stores
+    // i32 -> i16
+    defm : SVEVecStoreLane0Pat<am_indexed16,  truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui>;
+    defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi>;
+    // i64 -> i32
+    defm : SVEVecStoreLane0Pat<am_indexed32,  truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui>;
+    defm : SVEVecStoreLane0Pat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi>;
+    // i64 -> i16
+    defm : SVEVecStoreLane0Pat<am_indexed16,  truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui>;
+    defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi>;
+    // i16 -> i16 (technically a truncate as the extracted type is i32)
+    defm : SVEVecStoreLane0Pat<am_indexed16,  truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui>;
+    defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi>;
+
+    // Lane 0 stores
+    defm : SVEVecStoreLane0Pat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
+    defm : SVEVecStoreLane0Pat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
+  }
+
   // Insert subvectors into FP SVE vectors.
   foreach VT = [nxv4f16, nxv4f32, nxv4bf16] in
     foreach idx = [0, 2] in
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
new file mode 100644
index 0000000000000..22b136ac194cc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=STREAMING-COMPAT
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; TODO: Improve codegen for non-zero extract indices.
+
+define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s32:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT:    fmov w8, s0
+; STREAMING-COMPAT-NEXT:    str w8, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x i32> %b, i32 3
+  store i32 %0, ptr %a, align 4
+  ret void
+}
+
+define void @test_str_lane0_s32(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane0_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s32:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    str s0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x i32> %b, i32 0
+  store i32 %0, ptr %a, align 4
+  ret void
+}
+
+define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane_s64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s64:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT:    fmov x8, d0
+; STREAMING-COMPAT-NEXT:    str x8, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 2 x i64> %b, i32 1
+  store i64 %0, ptr %a, align 8
+  ret void
+}
+
+define void @test_str_lane0_s64(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane0_s64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s64:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    str d0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 2 x i64> %b, i32 0
+  store i64 %0, ptr %a, align 8
+  ret void
+}
+
+define void @test_str_lane_f32(ptr %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_str_lane_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, z0.s[3]
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_f32:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT:    str s0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x float> %b, i32 3
+  store float %0, ptr %a, align 4
+  ret void
+}
+
+define void @test_str_lane0_f32(ptr %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_str_lane0_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_f32:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    str s0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x float> %b, i32 0
+  store float %0, ptr %a, align 4
+  ret void
+}
+
+define void @test_str_lane_f64(ptr %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_str_lane_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_f64:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT:    str d0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 2 x double> %b, i32 1
+  store double %0, ptr %a, align 8
+  ret void
+}
+
+define void @test_str_lane0_f64(ptr %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_str_lane0_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_f64:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    str d0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 2 x double> %b, i32 0
+  store double %0, ptr %a, align 8
+  ret void
+}
+
+define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane_s8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umov w8, v0.b[7]
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s8:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.b, z0.b[7]
+; STREAMING-COMPAT-NEXT:    fmov w8, s0
+; STREAMING-COMPAT-NEXT:    strb w8, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 16 x i8> %b, i32 7
+  store i8 %0, ptr %a, align 1
+  ret void
+}
+
+define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umov w8, v0.h[3]
+; CHECK-NEXT:    strh w8, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s16:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.h, z0.h[3]
+; STREAMING-COMPAT-NEXT:    fmov w8, s0
+; STREAMING-COMPAT-NEXT:    strh w8, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 8 x i16> %b, i32 3
+  store i16 %0, ptr %a, align 2
+  ret void
+}
+
+define void @test_str_lane0_s16(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane0_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s16:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    str h0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 8 x i16> %b, i32 0
+  store i16 %0, ptr %a, align 2
+  ret void
+}
+
+define void @test_str_reduction_i32_to_i32(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32:
+; STREAMING-COMPAT:       // %bb.0:
+; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT:    str s0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+  %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+  %trunc = trunc i64 %reduce to i32
+  store i32 %trunc, ptr %ptr, align 4
+  ret void
+}
+
+define void @test_str_reduction_i32_to_i64(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64:
+; STREAMING-COMPAT:       // %bb.0:
+; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT:    str d0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+  %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+  store i64 %reduce, ptr %ptr, align 8
+  ret void
+}
+
+define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16:
+; STREAMING-COMPAT:       // %bb.0:
+; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT:    str h0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+  %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+  %trunc = trunc i64 %reduce to i16
+  store i16 %trunc, ptr %ptr, align 2
+  ret void
+}
+
+define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    stur s0, [x0, #-32]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32_negative_offset:
+; STREAMING-COMPAT:       // %bb.0:
+; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT:    stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT:    ret
+
+  %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+  %trunc = trunc i64 %reduce to i32
+  %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+  store i32 %trunc, ptr %out_ptr, align 4
+  ret void
+}
+
+define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i64_negative_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    stur x8, [x0, #-32]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64_negative_offset:
+; STREAMING-COMPAT:       // %bb.0:
+; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT:    fmov x8, d0
+; STREAMING-COMPAT-NEXT:    stur x8, [x0, #-32]
+; STREAMING-COMPAT-NEXT:    ret
+
+  %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+  %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+  store i64 %reduce, ptr %out_ptr, align 8
+  ret void
+}
+
+define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i16_negative_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    stur h0, [x0, #-32]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16_negative_offset:
+; STREAMING-COMPAT:       // %bb.0:
+; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-32]
+; STREAMING-COMPAT-NEXT:    ret
+
+  %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+  %trunc = trunc i64 %reduce to i16
+  %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+  store i16 %trunc, ptr %out_ptr, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 668dc18df6a0b..89f790210e193 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -332,15 +332,14 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
 ; CHECK-NEXT:    add z2.d, z5.d, z2.d
 ; CHECK-NEXT:    b.ne .LBB3_1
 ; CHECK-NEXT:  // %bb.2: // %middle.block
-; CHECK-NEXT:    uaddv d2, p0, z2.d
 ; CHECK-NEXT:    uzp2 z3.d, z1.d, z0.d
 ; CHECK-NEXT:    uzp1 z1.d, z1.d, z0.d
+; CHECK-NEXT:    uaddv d2, p0, z2.d
 ; CHECK-NEXT:    faddv d0, p0, z3.d
-; CHECK-NEXT:    fmov x8, d2
 ; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str s2, [x4]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
-; CHECK-NEXT:    str w8, [x4]
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 6644be11a02ba..ffef6f74f2d36 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -95,8 +95,7 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    mov z1.s, z0.s[1]
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    str w8, [x1]
+; CHECK-NEXT:    str s0, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: bitcast_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 7d6336a43a4fd..9e1d342663f0f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -268,24 +268,26 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
 ; CHECK-NEXT:    sunpklo z1.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    mov z2.d, z1.d[1]
 ; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    mov z1.d, z1.d[1]
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    str d0, [x8, #64]
+; CHECK-NEXT:    fmov x10, d2
 ; CHECK-NEXT:    fmov x11, d0
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    asr x10, x9, #63
-; CHECK-NEXT:    stp x9, x10, [x8]
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    asr x12, x11, #63
-; CHECK-NEXT:    stp x10, x10, [x8, #16]
-; CHECK-NEXT:    stp x11, x12, [x8, #64]
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    asr x10, x9, #63
-; CHECK-NEXT:    stp x12, x12, [x8, #80]
-; CHECK-NEXT:    stp x10, x10, [x8, #48]
-; CHECK-NEXT:    asr x12, x11, #63
-; CHECK-NEXT:    stp x9, x10, [x8, #32]
-; CHECK-NEXT:    stp x12, x12, [x8, #112]
-; CHECK-NEXT:    stp x11, x12, [x8, #96]
+; CHECK-NEXT:    asr x9, x9, #63
+; CHECK-NEXT:    stp x9, x9, [x8, #8]
+; CHECK-NEXT:    asr x11, x11, #63
+; CHECK-NEXT:    stp x9, x10, [x8, #24]
+; CHECK-NEXT:    asr x9, x10, #63
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    stp x11, x11, [x8, #72]
+; CHECK-NEXT:    stp x9, x9, [x8, #48]
+; CHECK-NEXT:    str x9, [x8, #40]
+; CHECK-NEXT:    asr x9, x10, #63
+; CHECK-NEXT:    stp x11, x10, [x8, #88]
+; CHECK-NEXT:    stp x9, x9, [x8, #112]
+; CHECK-NEXT:    str x9, [x8, #104]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 613543310f2c3..aa1adfd306a4c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -75,8 +75,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    ld1h { z1.s }, p1/z, [x8]
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    strb w8, [x19, #2]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w8, [x19]
+; CHECK-NEXT:    str h1, [x19]
 ; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
@@ -120,14 +119,12 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    mov x0, sp
 ; CHECK-NEXT:    bl def
 ; CHECK-NEXT:    adrp x8, .LCPI2_0
-; CHECK-NEXT:    ldr q0, [sp]
+; CHECK-NEXT:    ldp q0, q2, [sp]
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
 ; CHECK-NEXT:    tbl z0.b, { z0.b }, z1.b
-; CHECK-NEXT:    ldr q1, [sp, #16]
-; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strb w8, [x19, #8]
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    str x8, [x19]
+; CHECK-NEXT:    str d0, [x19]
 ; CHECK-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
index c8cea6ebabd48..434e24bf48724 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
@@ -121,9 +121,8 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_3: // %cond.store
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    fmov x10, d1
-; CHECK-NEXT:    str x9, [x10]
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    str d0, [x9]
 ; CHECK-NEXT:    tbz w8, #1, .LBB1_2
 ; CHECK-NEXT:  .LBB1_4: // %cond.store1
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index 431c5a78202e8..74e5fe7352cfd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -769,8 +769,7 @@ define void @fadd_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index b8779991dbb45..17579d79896da 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -93,8 +93,7 @@ define void @store_v2f16(ptr %a) {
 ; CHECK-LABEL: store_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: store_v2f16:

>From 3c7c7270434a711a220563867e091fc5070d996a Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 5 Mar 2025 10:17:01 +0000
Subject: [PATCH 2/7] Add missing folds

---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  8 +++--
 .../CodeGen/AArch64/aarch64-sve-ldst-one.ll   | 34 ++++++++++++++-----
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c5a246296ae0b..485d41a80ee34 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2016,8 +2016,12 @@ let Predicates = [HasSVE_or_SME] in {
     defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi>;
 
     // Lane 0 stores
-    defm : SVEVecStoreLane0Pat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
-    defm : SVEVecStoreLane0Pat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
+    // i32
+    defm : SVEVecStoreLane0Pat<am_indexed32,  store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
+    defm : SVEVecStoreLane0Pat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi>;
+    // i64
+    defm : SVEVecStoreLane0Pat<am_indexed64,  store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
+    defm : SVEVecStoreLane0Pat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi>;
   }
 
   // Insert subvectors into FP SVE vectors.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index 22b136ac194cc..c2bd513634b44 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -44,6 +44,24 @@ entry:
   ret void
 }
 
+define void @test_str_lane0_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane0_s32_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stur s0, [x0, #-32]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x i32> %b, i32 0
+  %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8
+  store i32 %0, ptr %out_ptr, align 4
+  ret void
+}
+
 define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: test_str_lane_s64:
 ; CHECK:       // %bb.0: // %entry
@@ -281,7 +299,7 @@ define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4
 
   %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
   %trunc = trunc i64 %reduce to i32
-  %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+  %out_ptr = getelementptr inbounds i32, ptr %ptr, i64 -8
   store i32 %trunc, ptr %out_ptr, align 4
   ret void
 }
@@ -290,19 +308,17 @@ define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, <vscale x 4
 ; CHECK-LABEL: test_str_reduction_i32_to_i64_negative_offset:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    stur x8, [x0, #-32]
+; CHECK-NEXT:    stur d0, [x0, #-64]
 ; CHECK-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64_negative_offset:
 ; STREAMING-COMPAT:       // %bb.0:
 ; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT:    fmov x8, d0
-; STREAMING-COMPAT-NEXT:    stur x8, [x0, #-32]
+; STREAMING-COMPAT-NEXT:    stur d0, [x0, #-64]
 ; STREAMING-COMPAT-NEXT:    ret
 
   %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
-  %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+  %out_ptr = getelementptr inbounds i64, ptr %ptr, i64 -8
   store i64 %reduce, ptr %out_ptr, align 8
   ret void
 }
@@ -311,18 +327,18 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
 ; CHECK-LABEL: test_str_reduction_i32_to_i16_negative_offset:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    stur h0, [x0, #-32]
+; CHECK-NEXT:    stur h0, [x0, #-16]
 ; CHECK-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16_negative_offset:
 ; STREAMING-COMPAT:       // %bb.0:
 ; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-32]
+; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
 ; STREAMING-COMPAT-NEXT:    ret
 
   %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
   %trunc = trunc i64 %reduce to i16
-  %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+  %out_ptr = getelementptr inbounds i16, ptr %ptr, i64 -8
   store i16 %trunc, ptr %out_ptr, align 2
   ret void
 }

>From 83bd58856b00aeae1ed931448fa039ddbb638ff8 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 5 Mar 2025 13:43:02 +0000
Subject: [PATCH 3/7] Handle a few more cases + more tests

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   1 +
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  84 +++---
 .../CodeGen/AArch64/aarch64-sve-ldst-one.ll   | 281 ++++++++++++++++--
 ...e-streaming-mode-fixed-length-ext-loads.ll |  28 +-
 ...mode-fixed-length-masked-gather-scatter.ll |   7 +-
 5 files changed, 322 insertions(+), 79 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 3bfc1a922357a..018fc1156cf91 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -134,6 +134,7 @@ def HasRDM           : Predicate<"Subtarget->hasRDM()">,
 def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
                                  AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">;
 def HasNoFullFP16    : Predicate<"!Subtarget->hasFullFP16()">;
+def HasNoNEON        : Predicate<"!Subtarget->isNeonAvailable()">;
 def HasFP16FML       : Predicate<"Subtarget->hasFP16FML()">,
                                  AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">;
 def HasSPE           : Predicate<"Subtarget->hasSPE()">,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 485d41a80ee34..f990a248ddb26 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1820,6 +1820,54 @@ let Predicates = [HasSVE] in {
   defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>;
 } // End HasSVE
 
+multiclass SVEVecStoreLanePat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
+                              ValueType VTy, ValueType STy,
+                              ValueType SubRegTy,
+                              SubRegIndex SubRegIdx, Operand IndexType,
+                              Instruction STR,
+                              Instruction DUP, AsmVectorIndexOpnd DUPIdxTy> {
+  let Predicates = [HasSVE_or_SME] in {
+    // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+    def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+                        (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+              (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+                    GPR64sp:$Rn, IndexType:$offset)>;
+  }
+
+  // Only used for streaming[-compatible] SVE -- when NEON is available we avoid a DUP.
+  let Predicates = [HasSVE_or_SME, HasNoNEON] in {
+    // Non-zero immediate index:
+    def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
+                        (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+              (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
+                    GPR64sp:$Rn, IndexType:$offset)>;
+  }
+}
+
+let AddedComplexity = 19 in {
+  // Lane 0 truncating stores
+  // i32 -> i16
+  defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+  defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+  // i64 -> i32
+  defm : SVEVecStoreLanePat<am_indexed32,  truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+  defm : SVEVecStoreLanePat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+  // i64 -> i16
+  defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+  defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+  // i16 -> i16 (technically a truncate as the extracted type is i32)
+  defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_H, sve_elm_idx_extdup_h>;
+  defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi, DUP_ZZI_H, sve_elm_idx_extdup_h>;
+
+  // Lane 0 stores
+  // i32
+  defm : SVEVecStoreLanePat<am_indexed32,  store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+  defm : SVEVecStoreLanePat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+  // i64
+  defm : SVEVecStoreLanePat<am_indexed64,  store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+  defm : SVEVecStoreLanePat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+}
+
 let Predicates = [HasSVE_or_SME] in {
   defm TBL_ZZZ  : sve_int_perm_tbl<"tbl", AArch64tbl>;
 
@@ -1988,42 +2036,6 @@ let Predicates = [HasSVE_or_SME] in {
   def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 6))),
             (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
 
-  // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
-  multiclass SVEVecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
-                              ValueType VTy, ValueType STy,
-                              ValueType SubRegTy,
-                              SubRegIndex SubRegIdx, Operand IndexType,
-                              Instruction STR> {
-    def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
-                      (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
-              (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
-                  GPR64sp:$Rn, IndexType:$offset)>;
-  }
-
-  let AddedComplexity = 19 in {
-    // Lane 0 truncating stores
-    // i32 -> i16
-    defm : SVEVecStoreLane0Pat<am_indexed16,  truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui>;
-    defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi>;
-    // i64 -> i32
-    defm : SVEVecStoreLane0Pat<am_indexed32,  truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui>;
-    defm : SVEVecStoreLane0Pat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi>;
-    // i64 -> i16
-    defm : SVEVecStoreLane0Pat<am_indexed16,  truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui>;
-    defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi>;
-    // i16 -> i16 (technically a truncate as the extracted type is i32)
-    defm : SVEVecStoreLane0Pat<am_indexed16,  truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui>;
-    defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi>;
-
-    // Lane 0 stores
-    // i32
-    defm : SVEVecStoreLane0Pat<am_indexed32,  store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
-    defm : SVEVecStoreLane0Pat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi>;
-    // i64
-    defm : SVEVecStoreLane0Pat<am_indexed64,  store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
-    defm : SVEVecStoreLane0Pat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi>;
-  }
-
   // Insert subvectors into FP SVE vectors.
   foreach VT = [nxv4f16, nxv4f32, nxv4bf16] in
     foreach idx = [0, 2] in
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index c2bd513634b44..7c460f45f7972 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -5,8 +5,6 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; TODO: Improve codegen for non-zero extract indices.
-
 define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: test_str_lane_s32:
 ; CHECK:       // %bb.0: // %entry
@@ -17,8 +15,7 @@ define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
 ; STREAMING-COMPAT-LABEL: test_str_lane_s32:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
 ; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT:    fmov w8, s0
-; STREAMING-COMPAT-NEXT:    str w8, [x0]
+; STREAMING-COMPAT-NEXT:    str s0, [x0]
 ; STREAMING-COMPAT-NEXT:    ret
 
 entry:
@@ -44,24 +41,6 @@ entry:
   ret void
 }
 
-define void @test_str_lane0_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_lane0_s32_negative_offset:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stur s0, [x0, #-32]
-; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    stur s0, [x0, #-32]
-; STREAMING-COMPAT-NEXT:    ret
-
-entry:
-  %0 = extractelement <vscale x 4 x i32> %b, i32 0
-  %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8
-  store i32 %0, ptr %out_ptr, align 4
-  ret void
-}
-
 define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: test_str_lane_s64:
 ; CHECK:       // %bb.0: // %entry
@@ -72,8 +51,7 @@ define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
 ; STREAMING-COMPAT-LABEL: test_str_lane_s64:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
 ; STREAMING-COMPAT-NEXT:    mov z0.d, z0.d[1]
-; STREAMING-COMPAT-NEXT:    fmov x8, d0
-; STREAMING-COMPAT-NEXT:    str x8, [x0]
+; STREAMING-COMPAT-NEXT:    str d0, [x0]
 ; STREAMING-COMPAT-NEXT:    ret
 
 entry:
@@ -191,6 +169,25 @@ entry:
   ret void
 }
 
+define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane0_s8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s8:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    fmov w8, s0
+; STREAMING-COMPAT-NEXT:    strb w8, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 16 x i8> %b, i32 0
+  store i8 %0, ptr %a, align 1
+  ret void
+}
+
 define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: test_str_lane_s16:
 ; CHECK:       // %bb.0: // %entry
@@ -201,8 +198,7 @@ define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
 ; STREAMING-COMPAT-LABEL: test_str_lane_s16:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
 ; STREAMING-COMPAT-NEXT:    mov z0.h, z0.h[3]
-; STREAMING-COMPAT-NEXT:    fmov w8, s0
-; STREAMING-COMPAT-NEXT:    strh w8, [x0]
+; STREAMING-COMPAT-NEXT:    str h0, [x0]
 ; STREAMING-COMPAT-NEXT:    ret
 
 entry:
@@ -342,3 +338,236 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
   store i16 %trunc, ptr %out_ptr, align 2
   ret void
 }
+
+define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane_s32_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    stur w8, [x0, #-32]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT:    stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x i32> %b, i32 3
+  %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8
+  store i32 %0, ptr %out_ptr, align 4
+  ret void
+}
+
+define void @test_str_lane0_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane0_s32_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stur s0, [x0, #-32]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x i32> %b, i32 0
+  %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8
+  store i32 %0, ptr %out_ptr, align 4
+  ret void
+}
+
+define void @test_str_lane_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane_s64_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    stur x8, [x0, #-64]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT:    stur d0, [x0, #-64]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 2 x i64> %b, i32 1
+  %out_ptr = getelementptr inbounds i64, ptr %a, i64 -8
+  store i64 %0, ptr %out_ptr, align 8
+  ret void
+}
+
+define void @test_str_lane0_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane0_s64_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stur d0, [x0, #-64]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s64_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    stur d0, [x0, #-64]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 2 x i64> %b, i32 0
+  %out_ptr = getelementptr inbounds i64, ptr %a, i64 -8
+  store i64 %0, ptr %out_ptr, align 8
+  ret void
+}
+
+define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane_s8_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umov w8, v0.b[7]
+; CHECK-NEXT:    sturb w8, [x0, #-8]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.b, z0.b[7]
+; STREAMING-COMPAT-NEXT:    fmov w8, s0
+; STREAMING-COMPAT-NEXT:    sturb w8, [x0, #-8]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 16 x i8> %b, i32 7
+  %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+  store i8 %0, ptr %out_ptr, align 1
+  ret void
+}
+
+define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane0_s8_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    sturb w8, [x0, #-8]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s8_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    fmov w8, s0
+; STREAMING-COMPAT-NEXT:    sturb w8, [x0, #-8]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 16 x i8> %b, i32 0
+  %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+  store i8 %0, ptr %out_ptr, align 1
+  ret void
+}
+
+define void @test_str_lane_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane_s16_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umov w8, v0.h[3]
+; CHECK-NEXT:    sturh w8, [x0, #-16]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.h, z0.h[3]
+; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 8 x i16> %b, i32 3
+  %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8
+  store i16 %0, ptr %out_ptr, align 2
+  ret void
+}
+
+define void @test_str_lane0_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane0_s16_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stur h0, [x0, #-16]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s16_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 8 x i16> %b, i32 0
+  %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8
+  store i16 %0, ptr %out_ptr, align 2
+  ret void
+}
+
+define void @test_str_trunc_lane_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    strh w8, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT:    str h0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x i32> %b, i32 3
+  %trunc = trunc i32 %0 to i16
+  store i16 %trunc, ptr %a, align 2
+  ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    str h0, [x0]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x i32> %b, i32 0
+  %trunc = trunc i32 %0 to i16
+  store i16 %trunc, ptr %a, align 2
+  ret void
+}
+
+define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    sturh w8, [x0, #-16]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x i32> %b, i32 3
+  %trunc = trunc i32 %0 to i16
+  %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8
+  store i16 %trunc, ptr %out_ptr, align 2
+  ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s16_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stur h0, [x0, #-16]
+; CHECK-NEXT:    ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16_negative_offset:
+; STREAMING-COMPAT:       // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
+; STREAMING-COMPAT-NEXT:    ret
+
+entry:
+  %0 = extractelement <vscale x 4 x i32> %b, i32 0
+  %trunc = trunc i32 %0 to i16
+  %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8
+  store i16 %trunc, ptr %out_ptr, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 9e1d342663f0f..2c891251befc7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -268,24 +268,26 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
 ; CHECK-NEXT:    sunpklo z1.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    mov z2.d, z1.d[1]
 ; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    mov z2.d, z1.d[1]
 ; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    asr x9, x9, #63
+; CHECK-NEXT:    mov z1.d, z0.d[1]
 ; CHECK-NEXT:    str d0, [x8, #64]
+; CHECK-NEXT:    stp x9, x9, [x8, #16]
+; CHECK-NEXT:    str x9, [x8, #8]
+; CHECK-NEXT:    asr x9, x10, #63
 ; CHECK-NEXT:    fmov x10, d2
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    str d2, [x8, #32]
+; CHECK-NEXT:    stp x9, x9, [x8, #80]
+; CHECK-NEXT:    str x9, [x8, #72]
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    asr x10, x10, #63
+; CHECK-NEXT:    str d1, [x8, #96]
+; CHECK-NEXT:    stp x10, x10, [x8, #48]
 ; CHECK-NEXT:    asr x9, x9, #63
-; CHECK-NEXT:    stp x9, x9, [x8, #8]
-; CHECK-NEXT:    asr x11, x11, #63
-; CHECK-NEXT:    stp x9, x10, [x8, #24]
-; CHECK-NEXT:    asr x9, x10, #63
-; CHECK-NEXT:    fmov x10, d0
-; CHECK-NEXT:    stp x11, x11, [x8, #72]
-; CHECK-NEXT:    stp x9, x9, [x8, #48]
-; CHECK-NEXT:    str x9, [x8, #40]
-; CHECK-NEXT:    asr x9, x10, #63
-; CHECK-NEXT:    stp x11, x10, [x8, #88]
+; CHECK-NEXT:    str x10, [x8, #40]
 ; CHECK-NEXT:    stp x9, x9, [x8, #112]
 ; CHECK-NEXT:    str x9, [x8, #104]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
index 434e24bf48724..d9f8482a3c503 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
@@ -125,11 +125,10 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
 ; CHECK-NEXT:    str d0, [x9]
 ; CHECK-NEXT:    tbz w8, #1, .LBB1_2
 ; CHECK-NEXT:  .LBB1_4: // %cond.store1
-; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    mov z1.d, z1.d[1]
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    str x8, [x9]
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    str d0, [x8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 ;

>From c3c36413f3169312145fd751788360f52e6e168b Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 6 Mar 2025 12:10:42 +0000
Subject: [PATCH 4/7] Avoid duplicate test checks

---
 .../CodeGen/AArch64/aarch64-sve-ldst-one.ll   | 216 +++++-------------
 1 file changed, 53 insertions(+), 163 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index 7c460f45f7972..2278bc82fcf6e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=STREAMING-COMPAT
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
 
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_lane_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, v0.s[3]
-; CHECK-NEXT:    str w8, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s32:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    mov w8, v0.s[3]
+; CHECK-NONSTREAMING-NEXT:    str w8, [x0]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_lane_s32:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -29,11 +29,6 @@ define void @test_str_lane0_s32(ptr %a, <vscale x 4 x i32> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s32:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    str s0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x i32> %b, i32 0
@@ -42,11 +37,11 @@ entry:
 }
 
 define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: test_str_lane_s64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    str x8, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s64:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    mov x8, v0.d[1]
+; CHECK-NONSTREAMING-NEXT:    str x8, [x0]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_lane_s64:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -65,11 +60,6 @@ define void @test_str_lane0_s64(ptr %a, <vscale x 2 x i64> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s64:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    str d0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 2 x i64> %b, i32 0
@@ -83,12 +73,6 @@ define void @test_str_lane_f32(ptr %a, <vscale x 4 x float> %b) {
 ; CHECK-NEXT:    mov z0.s, z0.s[3]
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_f32:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT:    str s0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x float> %b, i32 3
@@ -101,11 +85,6 @@ define void @test_str_lane0_f32(ptr %a, <vscale x 4 x float> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_f32:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    str s0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x float> %b, i32 0
@@ -119,12 +98,6 @@ define void @test_str_lane_f64(ptr %a, <vscale x 2 x double> %b) {
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_f64:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.d, z0.d[1]
-; STREAMING-COMPAT-NEXT:    str d0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 2 x double> %b, i32 1
@@ -137,11 +110,6 @@ define void @test_str_lane0_f64(ptr %a, <vscale x 2 x double> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_f64:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    str d0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 2 x double> %b, i32 0
@@ -150,11 +118,11 @@ entry:
 }
 
 define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: test_str_lane_s8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    umov w8, v0.b[7]
-; CHECK-NEXT:    strb w8, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s8:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    umov w8, v0.b[7]
+; CHECK-NONSTREAMING-NEXT:    strb w8, [x0]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_lane_s8:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -175,12 +143,6 @@ define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s8:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    fmov w8, s0
-; STREAMING-COMPAT-NEXT:    strb w8, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 16 x i8> %b, i32 0
@@ -189,11 +151,11 @@ entry:
 }
 
 define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: test_str_lane_s16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    umov w8, v0.h[3]
-; CHECK-NEXT:    strh w8, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s16:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    umov w8, v0.h[3]
+; CHECK-NONSTREAMING-NEXT:    strh w8, [x0]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_lane_s16:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -212,11 +174,6 @@ define void @test_str_lane0_s16(ptr %a, <vscale x 8 x i16> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    str h0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s16:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    str h0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 8 x i16> %b, i32 0
@@ -230,12 +187,6 @@ define void @test_str_reduction_i32_to_i32(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32:
-; STREAMING-COMPAT:       // %bb.0:
-; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT:    str s0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
   %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
   %trunc = trunc i64 %reduce to i32
@@ -249,12 +200,6 @@ define void @test_str_reduction_i32_to_i64(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64:
-; STREAMING-COMPAT:       // %bb.0:
-; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT:    str d0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
   %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
   store i64 %reduce, ptr %ptr, align 8
@@ -267,12 +212,6 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    str h0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16:
-; STREAMING-COMPAT:       // %bb.0:
-; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT:    str h0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
   %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
   %trunc = trunc i64 %reduce to i16
@@ -286,12 +225,6 @@ define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    stur s0, [x0, #-32]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32_negative_offset:
-; STREAMING-COMPAT:       // %bb.0:
-; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT:    stur s0, [x0, #-32]
-; STREAMING-COMPAT-NEXT:    ret
 
   %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
   %trunc = trunc i64 %reduce to i32
@@ -306,12 +239,6 @@ define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, <vscale x 4
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    stur d0, [x0, #-64]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64_negative_offset:
-; STREAMING-COMPAT:       // %bb.0:
-; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT:    stur d0, [x0, #-64]
-; STREAMING-COMPAT-NEXT:    ret
 
   %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
   %out_ptr = getelementptr inbounds i64, ptr %ptr, i64 -8
@@ -325,12 +252,6 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    stur h0, [x0, #-16]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16_negative_offset:
-; STREAMING-COMPAT:       // %bb.0:
-; STREAMING-COMPAT-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT:    ret
 
   %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
   %trunc = trunc i64 %reduce to i16
@@ -340,11 +261,11 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
 }
 
 define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_lane_s32_negative_offset:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, v0.s[3]
-; CHECK-NEXT:    stur w8, [x0, #-32]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s32_negative_offset:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    mov w8, v0.s[3]
+; CHECK-NONSTREAMING-NEXT:    stur w8, [x0, #-32]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -364,11 +285,6 @@ define void @test_str_lane0_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stur s0, [x0, #-32]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    stur s0, [x0, #-32]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x i32> %b, i32 0
@@ -378,11 +294,11 @@ entry:
 }
 
 define void @test_str_lane_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: test_str_lane_s64_negative_offset:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    stur x8, [x0, #-64]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s64_negative_offset:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    mov x8, v0.d[1]
+; CHECK-NONSTREAMING-NEXT:    stur x8, [x0, #-64]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -402,11 +318,6 @@ define void @test_str_lane0_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stur d0, [x0, #-64]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s64_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    stur d0, [x0, #-64]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 2 x i64> %b, i32 0
@@ -416,11 +327,11 @@ entry:
 }
 
 define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: test_str_lane_s8_negative_offset:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    umov w8, v0.b[7]
-; CHECK-NEXT:    sturb w8, [x0, #-8]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    umov w8, v0.b[7]
+; CHECK-NONSTREAMING-NEXT:    sturb w8, [x0, #-8]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -442,12 +353,6 @@ define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    sturb w8, [x0, #-8]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s8_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    fmov w8, s0
-; STREAMING-COMPAT-NEXT:    sturb w8, [x0, #-8]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 16 x i8> %b, i32 0
@@ -457,11 +362,11 @@ entry:
 }
 
 define void @test_str_lane_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: test_str_lane_s16_negative_offset:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    umov w8, v0.h[3]
-; CHECK-NEXT:    sturh w8, [x0, #-16]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s16_negative_offset:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    umov w8, v0.h[3]
+; CHECK-NONSTREAMING-NEXT:    sturh w8, [x0, #-16]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -481,11 +386,6 @@ define void @test_str_lane0_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stur h0, [x0, #-16]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s16_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 8 x i16> %b, i32 0
@@ -495,11 +395,11 @@ entry:
 }
 
 define void @test_str_trunc_lane_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_trunc_lane_s32_to_s16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, v0.s[3]
-; CHECK-NEXT:    strh w8, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    mov w8, v0.s[3]
+; CHECK-NONSTREAMING-NEXT:    strh w8, [x0]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -519,11 +419,6 @@ define void @test_str_trunc_lane0_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    str h0, [x0]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    str h0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x i32> %b, i32 0
@@ -533,11 +428,11 @@ entry:
 }
 
 define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, v0.s[3]
-; CHECK-NEXT:    sturh w8, [x0, #-16]
-; CHECK-NEXT:    ret
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
+; CHECK-NONSTREAMING:       // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT:    mov w8, v0.s[3]
+; CHECK-NONSTREAMING-NEXT:    sturh w8, [x0, #-16]
+; CHECK-NONSTREAMING-NEXT:    ret
 ;
 ; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
 ; STREAMING-COMPAT:       // %bb.0: // %entry
@@ -558,11 +453,6 @@ define void @test_str_trunc_lane0_s32_to_s16_negative_offset(ptr %a, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stur h0, [x0, #-16]
 ; CHECK-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x i32> %b, i32 0

>From 9436f62212c65f24a9ec9dabada49909a06d76a3 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 6 Mar 2025 14:45:17 +0000
Subject: [PATCH 5/7] Fixups

---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  73 +++++-----
 .../CodeGen/AArch64/aarch64-sve-ldst-one.ll   | 128 ++++++------------
 2 files changed, 75 insertions(+), 126 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index f990a248ddb26..72feca3e0e18e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1826,49 +1826,46 @@ multiclass SVEVecStoreLanePat<ComplexPattern UIAddrMode, SDPatternOperator store
                               SubRegIndex SubRegIdx, Operand IndexType,
                               Instruction STR,
                               Instruction DUP, AsmVectorIndexOpnd DUPIdxTy> {
-  let Predicates = [HasSVE_or_SME] in {
-    // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
-    def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
-                        (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
-              (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
-                    GPR64sp:$Rn, IndexType:$offset)>;
-  }
+  // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+  def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+                     (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+            (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+                  GPR64sp:$Rn, IndexType:$offset)>;
 
   // Only used for streaming[-compatible] SVE -- when NEON is available we avoid a DUP.
-  let Predicates = [HasSVE_or_SME, HasNoNEON] in {
-    // Non-zero immediate index:
-    def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
-                        (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
-              (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
-                    GPR64sp:$Rn, IndexType:$offset)>;
-  }
-}
-
-let AddedComplexity = 19 in {
-  // Lane 0 truncating stores
-  // i32 -> i16
-  defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
-  defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
-  // i64 -> i32
-  defm : SVEVecStoreLanePat<am_indexed32,  truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-  defm : SVEVecStoreLanePat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-  // i64 -> i16
-  defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-  defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-  // i16 -> i16 (technically a truncate as the extracted type is i32)
-  defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_H, sve_elm_idx_extdup_h>;
-  defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi, DUP_ZZI_H, sve_elm_idx_extdup_h>;
-
-  // Lane 0 stores
-  // i32
-  defm : SVEVecStoreLanePat<am_indexed32,  store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
-  defm : SVEVecStoreLanePat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
-  // i64
-  defm : SVEVecStoreLanePat<am_indexed64,  store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-  defm : SVEVecStoreLanePat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+  // Non-zero immediate index:
+  def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
+                     (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+            (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
+                  GPR64sp:$Rn, IndexType:$offset)>;
 }
 
 let Predicates = [HasSVE_or_SME] in {
+
+  let AddedComplexity = 19 in {
+    // Lane 0 truncating stores
+    // i32 -> i16
+    defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+    defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+    // i64 -> i32
+    defm : SVEVecStoreLanePat<am_indexed32,  truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+    defm : SVEVecStoreLanePat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+    // i64 -> i16
+    defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+    defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+    // i16 -> i16 (technically a truncate as the extracted type is i32)
+    defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_H, sve_elm_idx_extdup_h>;
+    defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi, DUP_ZZI_H, sve_elm_idx_extdup_h>;
+
+    // Lane 0 stores
+    // i32
+    defm : SVEVecStoreLanePat<am_indexed32,  store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+    defm : SVEVecStoreLanePat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+    // i64
+    defm : SVEVecStoreLanePat<am_indexed64,  store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+    defm : SVEVecStoreLanePat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+  }
+
   defm TBL_ZZZ  : sve_int_perm_tbl<"tbl", AArch64tbl>;
 
   defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index 2278bc82fcf6e..d39c9bf760621 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -6,17 +6,11 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s32:
-; CHECK-NONSTREAMING:       // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT:    mov w8, v0.s[3]
-; CHECK-NONSTREAMING-NEXT:    str w8, [x0]
-; CHECK-NONSTREAMING-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s32:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT:    str s0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
+; CHECK-LABEL: test_str_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, z0.s[3]
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x i32> %b, i32 3
@@ -37,17 +31,11 @@ entry:
 }
 
 define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s64:
-; CHECK-NONSTREAMING:       // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT:    mov x8, v0.d[1]
-; CHECK-NONSTREAMING-NEXT:    str x8, [x0]
-; CHECK-NONSTREAMING-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s64:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.d, z0.d[1]
-; STREAMING-COMPAT-NEXT:    str d0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
+; CHECK-LABEL: test_str_lane_s64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 2 x i64> %b, i32 1
@@ -151,17 +139,11 @@ entry:
 }
 
 define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s16:
-; CHECK-NONSTREAMING:       // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT:    umov w8, v0.h[3]
-; CHECK-NONSTREAMING-NEXT:    strh w8, [x0]
-; CHECK-NONSTREAMING-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s16:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.h, z0.h[3]
-; STREAMING-COMPAT-NEXT:    str h0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
+; CHECK-LABEL: test_str_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.h, z0.h[3]
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 8 x i16> %b, i32 3
@@ -261,17 +243,11 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
 }
 
 define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s32_negative_offset:
-; CHECK-NONSTREAMING:       // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT:    mov w8, v0.s[3]
-; CHECK-NONSTREAMING-NEXT:    stur w8, [x0, #-32]
-; CHECK-NONSTREAMING-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT:    stur s0, [x0, #-32]
-; STREAMING-COMPAT-NEXT:    ret
+; CHECK-LABEL: test_str_lane_s32_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, z0.s[3]
+; CHECK-NEXT:    stur s0, [x0, #-32]
+; CHECK-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x i32> %b, i32 3
@@ -294,17 +270,11 @@ entry:
 }
 
 define void @test_str_lane_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s64_negative_offset:
-; CHECK-NONSTREAMING:       // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT:    mov x8, v0.d[1]
-; CHECK-NONSTREAMING-NEXT:    stur x8, [x0, #-64]
-; CHECK-NONSTREAMING-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.d, z0.d[1]
-; STREAMING-COMPAT-NEXT:    stur d0, [x0, #-64]
-; STREAMING-COMPAT-NEXT:    ret
+; CHECK-LABEL: test_str_lane_s64_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    stur d0, [x0, #-64]
+; CHECK-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 2 x i64> %b, i32 1
@@ -362,17 +332,11 @@ entry:
 }
 
 define void @test_str_lane_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s16_negative_offset:
-; CHECK-NONSTREAMING:       // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT:    umov w8, v0.h[3]
-; CHECK-NONSTREAMING-NEXT:    sturh w8, [x0, #-16]
-; CHECK-NONSTREAMING-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.h, z0.h[3]
-; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT:    ret
+; CHECK-LABEL: test_str_lane_s16_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.h, z0.h[3]
+; CHECK-NEXT:    stur h0, [x0, #-16]
+; CHECK-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 8 x i16> %b, i32 3
@@ -395,17 +359,11 @@ entry:
 }
 
 define void @test_str_trunc_lane_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16:
-; CHECK-NONSTREAMING:       // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT:    mov w8, v0.s[3]
-; CHECK-NONSTREAMING-NEXT:    strh w8, [x0]
-; CHECK-NONSTREAMING-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT:    str h0, [x0]
-; STREAMING-COMPAT-NEXT:    ret
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, z0.s[3]
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x i32> %b, i32 3
@@ -428,17 +386,11 @@ entry:
 }
 
 define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
-; CHECK-NONSTREAMING:       // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT:    mov w8, v0.s[3]
-; CHECK-NONSTREAMING-NEXT:    sturh w8, [x0, #-16]
-; CHECK-NONSTREAMING-NEXT:    ret
-;
-; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
-; STREAMING-COMPAT:       // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT:    mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT:    stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT:    ret
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, z0.s[3]
+; CHECK-NEXT:    stur h0, [x0, #-16]
+; CHECK-NEXT:    ret
 
 entry:
   %0 = extractelement <vscale x 4 x i32> %b, i32 3

>From 468aca4412c6ad56e74f00bafda570f9207975a6 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 6 Mar 2025 14:50:59 +0000
Subject: [PATCH 6/7] Fixups

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 018fc1156cf91..3bfc1a922357a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -134,7 +134,6 @@ def HasRDM           : Predicate<"Subtarget->hasRDM()">,
 def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
                                  AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">;
 def HasNoFullFP16    : Predicate<"!Subtarget->hasFullFP16()">;
-def HasNoNEON        : Predicate<"!Subtarget->isNeonAvailable()">;
 def HasFP16FML       : Predicate<"Subtarget->hasFP16FML()">,
                                  AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">;
 def HasSPE           : Predicate<"Subtarget->hasSPE()">,

>From 62541448778e4d2112d249b368ca29ba52a582b1 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 6 Mar 2025 19:36:18 +0000
Subject: [PATCH 7/7] Rewrite as DAG combine

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 56 +++++++++++++++++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  4 +-
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 44 ---------------
 3 files changed, 53 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 51bb358fff6e9..33678b7573c4a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23886,6 +23886,20 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
   return Chain;
 }
 
+static int getFPSubregForVT(EVT VT) {
+  assert(VT.isSimple() && "Expected simple VT");
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f16:
+    return AArch64::hsub;
+  case MVT::f32:
+    return AArch64::ssub;
+  case MVT::f64:
+    return AArch64::dsub;
+  default:
+    llvm_unreachable("Unexpected VT!");
+  }
+}
+
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
@@ -23946,17 +23960,49 @@ static SDValue performSTORECombine(SDNode *N,
   if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
     return Store;
 
-  if (ST->isTruncatingStore()) {
-    EVT StoreVT = ST->getMemoryVT();
-    if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
-      return SDValue();
+  if (ST->isTruncatingStore() &&
+      isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
     if (SDValue Rshrnb =
             trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
       return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
-                               StoreVT, ST->getMemOperand());
+                               MemVT, ST->getMemOperand());
     }
   }
 
+  // This is an integer vector_extract_elt followed by a (possibly truncating)
+  // store. We may be able to replace this with a store of an FP subregister.
+  if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
+      Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue Vector = Value.getOperand(0);
+    SDValue ExtIdx = Value.getOperand(1);
+    EVT VectorVT = Vector.getValueType();
+    EVT ElemVT = VectorVT.getVectorElementType();
+    // TODO: Consider allowing Neon (a lot of churn, not necessarily better).
+    if (!VectorVT.isScalableVector())
+      return SDValue();
+    if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8)
+      return SDValue();
+    if (ValueVT != MemVT && !ST->isTruncatingStore())
+      return SDValue();
+
+    EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits());
+    EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT);
+    SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector);
+    SDValue Ext =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPElemVT, Cast, ExtIdx);
+
+    EVT FPMemVT = EVT::getFloatingPointVT(MemVT.getSizeInBits());
+    if (ST->isTruncatingStore() && FPMemVT != FPElemVT) {
+      SDValue Trunc = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
+                                                 FPMemVT, Ext);
+      return DAG.getStore(ST->getChain(), DL, Trunc, ST->getBasePtr(),
+                          ST->getMemOperand());
+    }
+
+    return DAG.getStore(ST->getChain(), DL, Ext, ST->getBasePtr(),
+                        ST->getMemOperand());
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 3bfc1a922357a..c836f3138a45f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4581,6 +4581,8 @@ let Predicates = [IsLE] in {
             (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
 }
 
+} // AddedComplexity = 10
+
 // unscaled i64 truncating stores
 def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
   (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
@@ -4589,8 +4591,6 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
 def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
   (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
 
-} // AddedComplexity = 10
-
 // Match stores from lane 0 to the appropriate subreg's store.
 multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
                              ValueType VTy, ValueType STy,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 72feca3e0e18e..7100fb4ea037d 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1820,52 +1820,8 @@ let Predicates = [HasSVE] in {
   defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>;
 } // End HasSVE
 
-multiclass SVEVecStoreLanePat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
-                              ValueType VTy, ValueType STy,
-                              ValueType SubRegTy,
-                              SubRegIndex SubRegIdx, Operand IndexType,
-                              Instruction STR,
-                              Instruction DUP, AsmVectorIndexOpnd DUPIdxTy> {
-  // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
-  def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
-                     (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
-            (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
-                  GPR64sp:$Rn, IndexType:$offset)>;
-
-  // Only used for streaming[-compatible] SVE -- when NEON is available we avoid a DUP.
-  // Non-zero immediate index:
-  def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
-                     (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
-            (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
-                  GPR64sp:$Rn, IndexType:$offset)>;
-}
-
 let Predicates = [HasSVE_or_SME] in {
 
-  let AddedComplexity = 19 in {
-    // Lane 0 truncating stores
-    // i32 -> i16
-    defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
-    defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
-    // i64 -> i32
-    defm : SVEVecStoreLanePat<am_indexed32,  truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-    defm : SVEVecStoreLanePat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-    // i64 -> i16
-    defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-    defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-    // i16 -> i16 (technically a truncate as the extracted type is i32)
-    defm : SVEVecStoreLanePat<am_indexed16,  truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_H, sve_elm_idx_extdup_h>;
-    defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi, DUP_ZZI_H, sve_elm_idx_extdup_h>;
-
-    // Lane 0 stores
-    // i32
-    defm : SVEVecStoreLanePat<am_indexed32,  store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
-    defm : SVEVecStoreLanePat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
-    // i64
-    defm : SVEVecStoreLanePat<am_indexed64,  store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-    defm : SVEVecStoreLanePat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
-  }
-
   defm TBL_ZZZ  : sve_int_perm_tbl<"tbl", AArch64tbl>;
 
   defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;



More information about the llvm-commits mailing list