[llvm] [AArch64][SVE] Fold integer lane extract and store to FPR store (PR #129756)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 18 02:53:59 PDT 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/129756
>From 58842de36b7003a08ca49b7320394c0146cbc8d1 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 4 Mar 2025 16:18:34 +0000
Subject: [PATCH 01/11] [AArch64][SVE] Fold integer lane 0 extract and store to
FPR store
This helps avoid some pointless fmovs to GPRs, which may be slow in
streaming mode.
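
For example, given a store of lane 0 of a scalable vector (a minimal
sketch distilled from the tests added below; the function name is
hypothetical):

  define void @store_lane0_s32(ptr %a, <vscale x 4 x i32> %b) {
    %elt = extractelement <vscale x 4 x i32> %b, i32 0
    store i32 %elt, ptr %a, align 4
    ret void
  }

this previously round-tripped through a GPR:

  fmov w8, s0
  str w8, [x0]

and with this fold becomes a single FPR store:

  str s0, [x0]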
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 +-
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 32 ++
.../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 328 ++++++++++++++++++
...plex-deinterleaving-reductions-scalable.ll | 5 +-
...sve-streaming-mode-fixed-length-bitcast.ll | 3 +-
...e-streaming-mode-fixed-length-ext-loads.ll | 32 +-
...-streaming-mode-fixed-length-ld2-alloca.ll | 11 +-
...mode-fixed-length-masked-gather-scatter.ll | 5 +-
...eaming-mode-fixed-length-optimize-ptrue.ll | 3 +-
.../sve-streaming-mode-fixed-length-stores.ll | 3 +-
10 files changed, 390 insertions(+), 36 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6c61e3a613f6f..92a4890372025 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4579,8 +4579,6 @@ let Predicates = [IsLE] in {
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
-} // AddedComplexity = 10
-
// unscaled i64 truncating stores
def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
(STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
@@ -4589,6 +4587,8 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+} // AddedComplexity = 10
+
// Match stores from lane 0 to the appropriate subreg's store.
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
ValueType VTy, ValueType STy,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3ee71c14c6bd4..d61afeccb09d1 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1995,6 +1995,38 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 6))),
(UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
+ // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+ multiclass SVEVecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ ValueType SubRegTy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR> {
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+ }
+
+ let AddedComplexity = 19 in {
+ // Lane 0 truncating stores
+ // i32 -> i16
+ defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi>;
+ // i64 -> i32
+ defm : SVEVecStoreLane0Pat<am_indexed32, truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi>;
+ // i64 -> i16
+ defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi>;
+ // i16 -> i16 (technically a truncate as the extracted type is i32)
+ defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi>;
+
+ // Lane 0 stores
+ defm : SVEVecStoreLane0Pat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
+ defm : SVEVecStoreLane0Pat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
+ }
+
// Insert subvectors into FP SVE vectors.
foreach VT = [nxv4f16, nxv4f32, nxv4bf16] in
foreach idx = [0, 2] in
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
new file mode 100644
index 0000000000000..22b136ac194cc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=STREAMING-COMPAT
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; TODO: Improve codegen for non-zero extract indices.
+
+define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, v0.s[3]
+; CHECK-NEXT: str w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: str w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ store i32 %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane0_s32(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane0_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ store i32 %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT: fmov x8, d0
+; STREAMING-COMPAT-NEXT: str x8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 1
+ store i64 %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane0_s64(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane0_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 0
+ store i64 %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane_f32(ptr %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_str_lane_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_f32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x float> %b, i32 3
+ store float %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane0_f32(ptr %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_str_lane0_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_f32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x float> %b, i32 0
+ store float %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane_f64(ptr %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_str_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_f64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x double> %b, i32 1
+ store double %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane0_f64(ptr %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_str_lane0_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_f64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x double> %b, i32 0
+ store double %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w8, v0.b[7]
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s8:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: strb w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 16 x i8> %b, i32 7
+ store i8 %0, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w8, v0.h[3]
+; CHECK-NEXT: strh w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s16:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: strh w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 8 x i16> %b, i32 3
+ store i16 %0, ptr %a, align 2
+ ret void
+}
+
+define void @test_str_lane0_s16(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane0_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s16:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str h0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 8 x i16> %b, i32 0
+ store i16 %0, ptr %a, align 2
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i32(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i32
+ store i32 %trunc, ptr %ptr, align 4
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i64(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ store i64 %reduce, ptr %ptr, align 8
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str h0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i16
+ store i16 %trunc, ptr %ptr, align 2
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur s0, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i32
+ %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ store i32 %trunc, ptr %out_ptr, align 4
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i64_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: stur x8, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: fmov x8, d0
+; STREAMING-COMPAT-NEXT: stur x8, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ store i64 %reduce, ptr %out_ptr, align 8
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i16_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur h0, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: stur h0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i16
+ %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ store i16 %trunc, ptr %out_ptr, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 668dc18df6a0b..89f790210e193 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -332,15 +332,14 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
; CHECK-NEXT: add z2.d, z5.d, z2.d
; CHECK-NEXT: b.ne .LBB3_1
; CHECK-NEXT: // %bb.2: // %middle.block
-; CHECK-NEXT: uaddv d2, p0, z2.d
; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d
+; CHECK-NEXT: uaddv d2, p0, z2.d
; CHECK-NEXT: faddv d0, p0, z3.d
-; CHECK-NEXT: fmov x8, d2
; CHECK-NEXT: faddv d1, p0, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str s2, [x4]
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z1
-; CHECK-NEXT: str w8, [x4]
; CHECK-NEXT: ret
entry:
%0 = tail call i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 6644be11a02ba..ffef6f74f2d36 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -95,8 +95,7 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: mov z1.s, z0.s[1]
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: str s0, [x1]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: bitcast_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 7d6336a43a4fd..9e1d342663f0f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -268,24 +268,26 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
; CHECK-NEXT: sunpklo z1.d, z0.s
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: mov z2.d, z1.d[1]
; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: mov z1.d, z1.d[1]
+; CHECK-NEXT: str d1, [x8]
+; CHECK-NEXT: str d0, [x8, #64]
+; CHECK-NEXT: fmov x10, d2
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: mov z0.d, z0.d[1]
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: stp x9, x10, [x8]
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: asr x12, x11, #63
-; CHECK-NEXT: stp x10, x10, [x8, #16]
-; CHECK-NEXT: stp x11, x12, [x8, #64]
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: stp x12, x12, [x8, #80]
-; CHECK-NEXT: stp x10, x10, [x8, #48]
-; CHECK-NEXT: asr x12, x11, #63
-; CHECK-NEXT: stp x9, x10, [x8, #32]
-; CHECK-NEXT: stp x12, x12, [x8, #112]
-; CHECK-NEXT: stp x11, x12, [x8, #96]
+; CHECK-NEXT: asr x9, x9, #63
+; CHECK-NEXT: stp x9, x9, [x8, #8]
+; CHECK-NEXT: asr x11, x11, #63
+; CHECK-NEXT: stp x9, x10, [x8, #24]
+; CHECK-NEXT: asr x9, x10, #63
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: stp x11, x11, [x8, #72]
+; CHECK-NEXT: stp x9, x9, [x8, #48]
+; CHECK-NEXT: str x9, [x8, #40]
+; CHECK-NEXT: asr x9, x10, #63
+; CHECK-NEXT: stp x11, x10, [x8, #88]
+; CHECK-NEXT: stp x9, x9, [x8, #112]
+; CHECK-NEXT: str x9, [x8, #104]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 613543310f2c3..aa1adfd306a4c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -75,8 +75,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8]
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: strb w8, [x19, #2]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [x19]
+; CHECK-NEXT: str h1, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -120,14 +119,12 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: bl def
; CHECK-NEXT: adrp x8, .LCPI2_0
-; CHECK-NEXT: ldr q0, [sp]
+; CHECK-NEXT: ldp q0, q2, [sp]
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
-; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w8, s2
; CHECK-NEXT: strb w8, [x19, #8]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: str x8, [x19]
+; CHECK-NEXT: str d0, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
index c8cea6ebabd48..434e24bf48724 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
@@ -121,9 +121,8 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_3: // %cond.store
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: str x9, [x10]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: str d0, [x9]
; CHECK-NEXT: tbz w8, #1, .LBB1_2
; CHECK-NEXT: .LBB1_4: // %cond.store1
; CHECK-NEXT: mov z0.d, z0.d[1]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index 431c5a78202e8..74e5fe7352cfd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -769,8 +769,7 @@ define void @fadd_v2f16(ptr %a, ptr %b) {
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr s1, [x1]
; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: str w8, [x0]
+; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fadd_v2f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index b8779991dbb45..17579d79896da 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -93,8 +93,7 @@ define void @store_v2f16(ptr %a) {
; CHECK-LABEL: store_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: str w8, [x0]
+; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: store_v2f16:
>From 2b54fe290db9676dd3e96503287da52a3c8d87f5 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 5 Mar 2025 10:17:01 +0000
Subject: [PATCH 02/11] Add missing folds
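
These are the unscaled-offset (STUR) forms of the non-truncating lane 0
stores, which the first patch missed. A sketch of a case this enables
(distilled from the updated tests; names hypothetical):

  %elt = extractelement <vscale x 2 x i64> %b, i32 0
  %out_ptr = getelementptr inbounds i64, ptr %a, i64 -8
  store i64 %elt, ptr %out_ptr, align 8

now selects to a single unscaled FPR store:

  stur d0, [x0, #-64]

rather than an fmov to a GPR followed by a GPR stur.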
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 8 +++--
.../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 34 ++++++++++++++-----
2 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index d61afeccb09d1..49fd743cc65b4 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2023,8 +2023,12 @@ let Predicates = [HasSVE_or_SME] in {
defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi>;
// Lane 0 stores
- defm : SVEVecStoreLane0Pat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
- defm : SVEVecStoreLane0Pat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
+ // i32
+ defm : SVEVecStoreLane0Pat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi>;
+ // i64
+ defm : SVEVecStoreLane0Pat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi>;
}
// Insert subvectors into FP SVE vectors.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index 22b136ac194cc..c2bd513634b44 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -44,6 +44,24 @@ entry:
ret void
}
+define void @test_str_lane0_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane0_s32_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stur s0, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8
+ store i32 %0, ptr %out_ptr, align 4
+ ret void
+}
+
define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_str_lane_s64:
; CHECK: // %bb.0: // %entry
@@ -281,7 +299,7 @@ define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
%trunc = trunc i64 %reduce to i32
- %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ %out_ptr = getelementptr inbounds i32, ptr %ptr, i64 -8
store i32 %trunc, ptr %out_ptr, align 4
ret void
}
@@ -290,19 +308,17 @@ define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, <vscale x 4
; CHECK-LABEL: test_str_reduction_i32_to_i64_negative_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: stur x8, [x0, #-32]
+; CHECK-NEXT: stur d0, [x0, #-64]
; CHECK-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64_negative_offset:
; STREAMING-COMPAT: // %bb.0:
; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT: fmov x8, d0
-; STREAMING-COMPAT-NEXT: stur x8, [x0, #-32]
+; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64]
; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
- %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ %out_ptr = getelementptr inbounds i64, ptr %ptr, i64 -8
store i64 %reduce, ptr %out_ptr, align 8
ret void
}
@@ -311,18 +327,18 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
; CHECK-LABEL: test_str_reduction_i32_to_i16_negative_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: stur h0, [x0, #-32]
+; CHECK-NEXT: stur h0, [x0, #-16]
; CHECK-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16_negative_offset:
; STREAMING-COMPAT: // %bb.0:
; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT: stur h0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
%trunc = trunc i64 %reduce to i16
- %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ %out_ptr = getelementptr inbounds i16, ptr %ptr, i64 -8
store i16 %trunc, ptr %out_ptr, align 2
ret void
}
>From 8a9bc1cb7cdc2b38edb33c42b991a866bfd515f3 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 5 Mar 2025 13:43:02 +0000
Subject: [PATCH 03/11] Handle a few more cases + more tests
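
The new cases cover non-zero lane indices when NEON is unavailable
(streaming[-compatible] SVE): the lane is first moved to element 0 with
an SVE DUP, then stored directly from the FPR. A sketch (distilled from
the updated tests):

  %elt = extractelement <vscale x 4 x i32> %b, i32 3
  store i32 %elt, ptr %a, align 4

streaming-compatible codegen goes from:

  mov z0.s, z0.s[3]
  fmov w8, s0
  str w8, [x0]

to:

  mov z0.s, z0.s[3]
  str s0, [x0]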
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 1 +
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 84 +++---
.../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 281 ++++++++++++++++--
...e-streaming-mode-fixed-length-ext-loads.ll | 28 +-
...mode-fixed-length-masked-gather-scatter.ll | 7 +-
5 files changed, 322 insertions(+), 79 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 92a4890372025..d374c1007dbe7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -134,6 +134,7 @@ def HasRDM : Predicate<"Subtarget->hasRDM()">,
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">;
def HasNoFullFP16 : Predicate<"!Subtarget->hasFullFP16()">;
+def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 49fd743cc65b4..00d1ea3bf6432 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1827,6 +1827,54 @@ let Predicates = [HasSVE] in {
defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>;
} // End HasSVE
+multiclass SVEVecStoreLanePat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ ValueType SubRegTy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR,
+ Instruction DUP, AsmVectorIndexOpnd DUPIdxTy> {
+ let Predicates = [HasSVE_or_SME] in {
+ // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+ }
+
+ // Only used for streaming[-compatible] SVE -- when NEON is available we avoid a DUP.
+ let Predicates = [HasSVE_or_SME, HasNoNEON] in {
+ // Non-zero immediate index:
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+ }
+}
+
+let AddedComplexity = 19 in {
+ // Lane 0 truncating stores
+ // i32 -> i16
+ defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ // i64 -> i32
+ defm : SVEVecStoreLanePat<am_indexed32, truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ defm : SVEVecStoreLanePat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ // i64 -> i16
+ defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ // i16 -> i16 (technically a truncate as the extracted type is i32)
+ defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_H, sve_elm_idx_extdup_h>;
+ defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi, DUP_ZZI_H, sve_elm_idx_extdup_h>;
+
+ // Lane 0 stores
+ // i32
+ defm : SVEVecStoreLanePat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ defm : SVEVecStoreLanePat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ // i64
+ defm : SVEVecStoreLanePat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ defm : SVEVecStoreLanePat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+}
+
let Predicates = [HasSVE_or_SME] in {
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
@@ -1995,42 +2043,6 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 6))),
(UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
- // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
- multiclass SVEVecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
- ValueType VTy, ValueType STy,
- ValueType SubRegTy,
- SubRegIndex SubRegIdx, Operand IndexType,
- Instruction STR> {
- def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
- (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
- (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
- GPR64sp:$Rn, IndexType:$offset)>;
- }
-
- let AddedComplexity = 19 in {
- // Lane 0 truncating stores
- // i32 -> i16
- defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui>;
- defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi>;
- // i64 -> i32
- defm : SVEVecStoreLane0Pat<am_indexed32, truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui>;
- defm : SVEVecStoreLane0Pat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi>;
- // i64 -> i16
- defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui>;
- defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi>;
- // i16 -> i16 (technically a truncate as the extracted type is i32)
- defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui>;
- defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi>;
-
- // Lane 0 stores
- // i32
- defm : SVEVecStoreLane0Pat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
- defm : SVEVecStoreLane0Pat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi>;
- // i64
- defm : SVEVecStoreLane0Pat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
- defm : SVEVecStoreLane0Pat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi>;
- }
-
// Insert subvectors into FP SVE vectors.
foreach VT = [nxv4f16, nxv4f32, nxv4bf16] in
foreach idx = [0, 2] in
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index c2bd513634b44..7c460f45f7972 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -5,8 +5,6 @@
target triple = "aarch64-unknown-linux-gnu"
-; TODO: Improve codegen for non-zero extract indices.
-
define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: test_str_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -17,8 +15,7 @@ define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
; STREAMING-COMPAT-LABEL: test_str_lane_s32:
; STREAMING-COMPAT: // %bb.0: // %entry
; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: str w8, [x0]
+; STREAMING-COMPAT-NEXT: str s0, [x0]
; STREAMING-COMPAT-NEXT: ret
entry:
@@ -44,24 +41,6 @@ entry:
ret void
}
-define void @test_str_lane0_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_lane0_s32_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stur s0, [x0, #-32]
-; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
-; STREAMING-COMPAT-NEXT: ret
-
-entry:
- %0 = extractelement <vscale x 4 x i32> %b, i32 0
- %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8
- store i32 %0, ptr %out_ptr, align 4
- ret void
-}
-
define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_str_lane_s64:
; CHECK: // %bb.0: // %entry
@@ -72,8 +51,7 @@ define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
; STREAMING-COMPAT-LABEL: test_str_lane_s64:
; STREAMING-COMPAT: // %bb.0: // %entry
; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
-; STREAMING-COMPAT-NEXT: fmov x8, d0
-; STREAMING-COMPAT-NEXT: str x8, [x0]
+; STREAMING-COMPAT-NEXT: str d0, [x0]
; STREAMING-COMPAT-NEXT: ret
entry:
@@ -191,6 +169,25 @@ entry:
ret void
}
+define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane0_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s8:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: strb w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 16 x i8> %b, i32 0
+ store i8 %0, ptr %a, align 1
+ ret void
+}
+
define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: test_str_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -201,8 +198,7 @@ define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
; STREAMING-COMPAT-LABEL: test_str_lane_s16:
; STREAMING-COMPAT: // %bb.0: // %entry
; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: strh w8, [x0]
+; STREAMING-COMPAT-NEXT: str h0, [x0]
; STREAMING-COMPAT-NEXT: ret
entry:
@@ -342,3 +338,236 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
store i16 %trunc, ptr %out_ptr, align 2
ret void
}
+
+define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane_s32_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, v0.s[3]
+; CHECK-NEXT: stur w8, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8
+ store i32 %0, ptr %out_ptr, align 4
+ ret void
+}
+
+define void @test_str_lane0_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane0_s32_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stur s0, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8
+ store i32 %0, ptr %out_ptr, align 4
+ ret void
+}
+
+define void @test_str_lane_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane_s64_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: stur x8, [x0, #-64]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 1
+ %out_ptr = getelementptr inbounds i64, ptr %a, i64 -8
+ store i64 %0, ptr %out_ptr, align 8
+ ret void
+}
+
+define void @test_str_lane0_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane0_s64_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stur d0, [x0, #-64]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s64_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 0
+ %out_ptr = getelementptr inbounds i64, ptr %a, i64 -8
+ store i64 %0, ptr %out_ptr, align 8
+ ret void
+}
+
+define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w8, v0.b[7]
+; CHECK-NEXT: sturb w8, [x0, #-8]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 16 x i8> %b, i32 7
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %0, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane0_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sturb w8, [x0, #-8]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s8_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 16 x i8> %b, i32 0
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %0, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_lane_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane_s16_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w8, v0.h[3]
+; CHECK-NEXT: sturh w8, [x0, #-16]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3]
+; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 8 x i16> %b, i32 3
+ %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8
+ store i16 %0, ptr %out_ptr, align 2
+ ret void
+}
+
+define void @test_str_lane0_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane0_s16_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stur h0, [x0, #-16]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s16_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 8 x i16> %b, i32 0
+ %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8
+ store i16 %0, ptr %out_ptr, align 2
+ ret void
+}
+
+define void @test_str_trunc_lane_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, v0.s[3]
+; CHECK-NEXT: strh w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: str h0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i16
+ store i16 %trunc, ptr %a, align 2
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str h0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i16
+ store i16 %trunc, ptr %a, align 2
+ ret void
+}
+
+define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, v0.s[3]
+; CHECK-NEXT: sturh w8, [x0, #-16]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i16
+ %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8
+ store i16 %trunc, ptr %out_ptr, align 2
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s16_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stur h0, [x0, #-16]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i16
+ %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8
+ store i16 %trunc, ptr %out_ptr, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 9e1d342663f0f..2c891251befc7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -268,24 +268,26 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
; CHECK-NEXT: sunpklo z1.d, z0.s
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.d, z0.s
-; CHECK-NEXT: mov z2.d, z1.d[1]
; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: mov z2.d, z1.d[1]
; CHECK-NEXT: str d1, [x8]
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: asr x9, x9, #63
+; CHECK-NEXT: mov z1.d, z0.d[1]
; CHECK-NEXT: str d0, [x8, #64]
+; CHECK-NEXT: stp x9, x9, [x8, #16]
+; CHECK-NEXT: str x9, [x8, #8]
+; CHECK-NEXT: asr x9, x10, #63
; CHECK-NEXT: fmov x10, d2
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: str d2, [x8, #32]
+; CHECK-NEXT: stp x9, x9, [x8, #80]
+; CHECK-NEXT: str x9, [x8, #72]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: asr x10, x10, #63
+; CHECK-NEXT: str d1, [x8, #96]
+; CHECK-NEXT: stp x10, x10, [x8, #48]
; CHECK-NEXT: asr x9, x9, #63
-; CHECK-NEXT: stp x9, x9, [x8, #8]
-; CHECK-NEXT: asr x11, x11, #63
-; CHECK-NEXT: stp x9, x10, [x8, #24]
-; CHECK-NEXT: asr x9, x10, #63
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: stp x11, x11, [x8, #72]
-; CHECK-NEXT: stp x9, x9, [x8, #48]
-; CHECK-NEXT: str x9, [x8, #40]
-; CHECK-NEXT: asr x9, x10, #63
-; CHECK-NEXT: stp x11, x10, [x8, #88]
+; CHECK-NEXT: str x10, [x8, #40]
; CHECK-NEXT: stp x9, x9, [x8, #112]
; CHECK-NEXT: str x9, [x8, #104]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
index 434e24bf48724..d9f8482a3c503 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
@@ -125,11 +125,10 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
; CHECK-NEXT: str d0, [x9]
; CHECK-NEXT: tbz w8, #1, .LBB1_2
; CHECK-NEXT: .LBB1_4: // %cond.store1
-; CHECK-NEXT: mov z0.d, z0.d[1]
; CHECK-NEXT: mov z1.d, z1.d[1]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: str x8, [x9]
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: str d0, [x8]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
>From 446ae821958a9f6e1d5c17ef6f6f198bbfee736e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 6 Mar 2025 12:10:42 +0000
Subject: [PATCH 04/11] Avoid duplicate test checks
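
Where the streaming and non-streaming output is identical, sharing a
common CHECK prefix between the RUN lines lets update_llc_test_checks.py
emit one check block instead of two per function, e.g. (other flags
elided):

  ; RUN: llc < %s ... | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING
  ; RUN: llc < %s ... -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT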
---
.../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 216 +++++-------------
1 file changed, 53 insertions(+), 163 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index 7c460f45f7972..2278bc82fcf6e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=STREAMING-COMPAT
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
target triple = "aarch64-unknown-linux-gnu"
define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_lane_s32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, v0.s[3]
-; CHECK-NEXT: str w8, [x0]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s32:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3]
+; CHECK-NONSTREAMING-NEXT: str w8, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s32:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -29,11 +29,6 @@ define void @test_str_lane0_s32(ptr %a, <vscale x 4 x i32> %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s32:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: str s0, [x0]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 0
@@ -42,11 +37,11 @@ entry:
}
define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: test_str_lane_s64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: str x8, [x0]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s64:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov x8, v0.d[1]
+; CHECK-NONSTREAMING-NEXT: str x8, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s64:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -65,11 +60,6 @@ define void @test_str_lane0_s64(ptr %a, <vscale x 2 x i64> %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s64:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: str d0, [x0]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 2 x i64> %b, i32 0
@@ -83,12 +73,6 @@ define void @test_str_lane_f32(ptr %a, <vscale x 4 x float> %b) {
; CHECK-NEXT: mov z0.s, z0.s[3]
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_f32:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT: str s0, [x0]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x float> %b, i32 3
@@ -101,11 +85,6 @@ define void @test_str_lane0_f32(ptr %a, <vscale x 4 x float> %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_f32:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: str s0, [x0]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x float> %b, i32 0
@@ -119,12 +98,6 @@ define void @test_str_lane_f64(ptr %a, <vscale x 2 x double> %b) {
; CHECK-NEXT: mov z0.d, z0.d[1]
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_f64:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
-; STREAMING-COMPAT-NEXT: str d0, [x0]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 2 x double> %b, i32 1
@@ -137,11 +110,6 @@ define void @test_str_lane0_f64(ptr %a, <vscale x 2 x double> %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_f64:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: str d0, [x0]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 2 x double> %b, i32 0
@@ -150,11 +118,11 @@ entry:
}
define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: test_str_lane_s8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: strb w8, [x0]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s8:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
+; CHECK-NONSTREAMING-NEXT: strb w8, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s8:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -175,12 +143,6 @@ define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s8:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: strb w8, [x0]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 0
@@ -189,11 +151,11 @@ entry:
}
define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: test_str_lane_s16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: strh w8, [x0]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s16:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: umov w8, v0.h[3]
+; CHECK-NONSTREAMING-NEXT: strh w8, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s16:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -212,11 +174,6 @@ define void @test_str_lane0_s16(ptr %a, <vscale x 8 x i16> %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str h0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s16:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: str h0, [x0]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 8 x i16> %b, i32 0
@@ -230,12 +187,6 @@ define void @test_str_reduction_i32_to_i32(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32:
-; STREAMING-COMPAT: // %bb.0:
-; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT: str s0, [x0]
-; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
%trunc = trunc i64 %reduce to i32
@@ -249,12 +200,6 @@ define void @test_str_reduction_i32_to_i64(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64:
-; STREAMING-COMPAT: // %bb.0:
-; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT: str d0, [x0]
-; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
store i64 %reduce, ptr %ptr, align 8
@@ -267,12 +212,6 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: str h0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16:
-; STREAMING-COMPAT: // %bb.0:
-; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT: str h0, [x0]
-; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
%trunc = trunc i64 %reduce to i16
@@ -286,12 +225,6 @@ define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: stur s0, [x0, #-32]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32_negative_offset:
-; STREAMING-COMPAT: // %bb.0:
-; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
-; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
%trunc = trunc i64 %reduce to i32
@@ -306,12 +239,6 @@ define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, <vscale x 4
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: stur d0, [x0, #-64]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64_negative_offset:
-; STREAMING-COMPAT: // %bb.0:
-; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64]
-; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
%out_ptr = getelementptr inbounds i64, ptr %ptr, i64 -8
@@ -325,12 +252,6 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: stur h0, [x0, #-16]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16_negative_offset:
-; STREAMING-COMPAT: // %bb.0:
-; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
-; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
%trunc = trunc i64 %reduce to i16
@@ -340,11 +261,11 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
}
define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_lane_s32_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, v0.s[3]
-; CHECK-NEXT: stur w8, [x0, #-32]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s32_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3]
+; CHECK-NONSTREAMING-NEXT: stur w8, [x0, #-32]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -364,11 +285,6 @@ define void @test_str_lane0_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stur s0, [x0, #-32]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 0
@@ -378,11 +294,11 @@ entry:
}
define void @test_str_lane_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: test_str_lane_s64_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: stur x8, [x0, #-64]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s64_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov x8, v0.d[1]
+; CHECK-NONSTREAMING-NEXT: stur x8, [x0, #-64]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -402,11 +318,6 @@ define void @test_str_lane0_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stur d0, [x0, #-64]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s64_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 2 x i64> %b, i32 0
@@ -416,11 +327,11 @@ entry:
}
define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: test_str_lane_s8_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: sturb w8, [x0, #-8]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
+; CHECK-NONSTREAMING-NEXT: sturb w8, [x0, #-8]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -442,12 +353,6 @@ define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: sturb w8, [x0, #-8]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s8_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 0
@@ -457,11 +362,11 @@ entry:
}
define void @test_str_lane_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: test_str_lane_s16_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: sturh w8, [x0, #-16]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s16_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: umov w8, v0.h[3]
+; CHECK-NONSTREAMING-NEXT: sturh w8, [x0, #-16]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -481,11 +386,6 @@ define void @test_str_lane0_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stur h0, [x0, #-16]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane0_s16_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 8 x i16> %b, i32 0
@@ -495,11 +395,11 @@ entry:
}
define void @test_str_trunc_lane_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_trunc_lane_s32_to_s16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, v0.s[3]
-; CHECK-NEXT: strh w8, [x0]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3]
+; CHECK-NONSTREAMING-NEXT: strh w8, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -519,11 +419,6 @@ define void @test_str_trunc_lane0_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str h0, [x0]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: str h0, [x0]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 0
@@ -533,11 +428,11 @@ entry:
}
define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, v0.s[3]
-; CHECK-NEXT: sturh w8, [x0, #-16]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3]
+; CHECK-NONSTREAMING-NEXT: sturh w8, [x0, #-16]
+; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
; STREAMING-COMPAT: // %bb.0: // %entry
@@ -558,11 +453,6 @@ define void @test_str_trunc_lane0_s32_to_s16_negative_offset(ptr %a, <vscale x 4
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stur h0, [x0, #-16]
; CHECK-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 0
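A minimal standalone reproducer for the reduction cases above (illustrative
only -- the function name is made up, and the store line is the obvious
completion of the truncated hunk):

; With the fold, the truncated uaddv result is stored directly from the FPR
; ("uaddv d0, p0, z0.s" + "str h0, [x0]") rather than round-tripping through
; a GPR via an fmov.
define void @uaddv_trunc_store(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
  %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
  %trunc = trunc i64 %reduce to i16
  store i16 %trunc, ptr %ptr, align 2
  ret void
}
declare i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)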
>From 40b1a948759590530d6798896d6355136d146f18 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 6 Mar 2025 14:45:17 +0000
Subject: [PATCH 05/11] Fixups
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 73 +++++-----
.../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 128 ++++++------------
2 files changed, 75 insertions(+), 126 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 00d1ea3bf6432..723e01853baed 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1833,49 +1833,46 @@ multiclass SVEVecStoreLanePat<ComplexPattern UIAddrMode, SDPatternOperator store
SubRegIndex SubRegIdx, Operand IndexType,
Instruction STR,
Instruction DUP, AsmVectorIndexOpnd DUPIdxTy> {
- let Predicates = [HasSVE_or_SME] in {
- // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
- def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
- (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
- (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
- GPR64sp:$Rn, IndexType:$offset)>;
- }
+ // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
// Only used for streaming[-compatible] SVE -- when NEON is available we avoid a DUP.
- let Predicates = [HasSVE_or_SME, HasNoNEON] in {
- // Non-zero immediate index:
- def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
- (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
- (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
- GPR64sp:$Rn, IndexType:$offset)>;
- }
-}
-
-let AddedComplexity = 19 in {
- // Lane 0 truncating stores
- // i32 -> i16
- defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- // i64 -> i32
- defm : SVEVecStoreLanePat<am_indexed32, truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- defm : SVEVecStoreLanePat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- // i64 -> i16
- defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- // i16 -> i16 (technically a truncate as the extracted type is i32)
- defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_H, sve_elm_idx_extdup_h>;
- defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi, DUP_ZZI_H, sve_elm_idx_extdup_h>;
-
- // Lane 0 stores
- // i32
- defm : SVEVecStoreLanePat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- defm : SVEVecStoreLanePat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- // i64
- defm : SVEVecStoreLanePat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- defm : SVEVecStoreLanePat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ // Non-zero immediate index:
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
}
let Predicates = [HasSVE_or_SME] in {
+
+ let AddedComplexity = 19 in {
+ // Lane 0 truncating stores
+ // i32 -> i16
+ defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ // i64 -> i32
+ defm : SVEVecStoreLanePat<am_indexed32, truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ defm : SVEVecStoreLanePat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ // i64 -> i16
+ defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ // i16 -> i16 (technically a truncate as the extracted type is i32)
+ defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_H, sve_elm_idx_extdup_h>;
+ defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi, DUP_ZZI_H, sve_elm_idx_extdup_h>;
+
+ // Lane 0 stores
+ // i32
+ defm : SVEVecStoreLanePat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ defm : SVEVecStoreLanePat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ // i64
+ defm : SVEVecStoreLanePat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ defm : SVEVecStoreLanePat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ }
+
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index 2278bc82fcf6e..d39c9bf760621 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -6,17 +6,11 @@
target triple = "aarch64-unknown-linux-gnu"
define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s32:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3]
-; CHECK-NONSTREAMING-NEXT: str w8, [x0]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s32:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT: str s0, [x0]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 3
@@ -37,17 +31,11 @@ entry:
}
define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s64:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: mov x8, v0.d[1]
-; CHECK-NONSTREAMING-NEXT: str x8, [x0]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s64:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
-; STREAMING-COMPAT-NEXT: str d0, [x0]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 2 x i64> %b, i32 1
@@ -151,17 +139,11 @@ entry:
}
define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s16:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.h[3]
-; CHECK-NONSTREAMING-NEXT: strh w8, [x0]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s16:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3]
-; STREAMING-COMPAT-NEXT: str h0, [x0]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.h, z0.h[3]
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 8 x i16> %b, i32 3
@@ -261,17 +243,11 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
}
define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s32_negative_offset:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3]
-; CHECK-NONSTREAMING-NEXT: stur w8, [x0, #-32]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s32_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: stur s0, [x0, #-32]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 3
@@ -294,17 +270,11 @@ entry:
}
define void @test_str_lane_s64_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s64_negative_offset:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: mov x8, v0.d[1]
-; CHECK-NONSTREAMING-NEXT: stur x8, [x0, #-64]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
-; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s64_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: stur d0, [x0, #-64]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 2 x i64> %b, i32 1
@@ -362,17 +332,11 @@ entry:
}
define void @test_str_lane_s16_negative_offset(ptr %a, <vscale x 8 x i16> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s16_negative_offset:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.h[3]
-; CHECK-NONSTREAMING-NEXT: sturh w8, [x0, #-16]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3]
-; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s16_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.h, z0.h[3]
+; CHECK-NEXT: stur h0, [x0, #-16]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 8 x i16> %b, i32 3
@@ -395,17 +359,11 @@ entry:
}
define void @test_str_trunc_lane_s32_to_s16(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3]
-; CHECK-NONSTREAMING-NEXT: strh w8, [x0]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT: str h0, [x0]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 3
@@ -428,17 +386,11 @@ entry:
}
define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3]
-; CHECK-NONSTREAMING-NEXT: sturh w8, [x0, #-16]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
-; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: stur h0, [x0, #-16]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 3
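After this fixup the streaming-compatible and regular SVE runs share a single
check block: an input of the shape below (an illustrative copy of the test
above, not an addition to it) lowers to "mov z0.s, z0.s[3]" + "str s0, [x0]"
in both modes.

define void @lane3_store(ptr %a, <vscale x 4 x i32> %b) {
entry:
  %0 = extractelement <vscale x 4 x i32> %b, i32 3
  store i32 %0, ptr %a, align 4
  ret void
}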
>From 98865833804547ced178f53ddc098536338cbd70 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 6 Mar 2025 14:50:59 +0000
Subject: [PATCH 06/11] Fixups
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d374c1007dbe7..92a4890372025 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -134,7 +134,6 @@ def HasRDM : Predicate<"Subtarget->hasRDM()">,
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">;
def HasNoFullFP16 : Predicate<"!Subtarget->hasFullFP16()">;
-def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
>From 17f34a65b24d79cce7014f8133f243d4545d5fe2 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 6 Mar 2025 19:36:18 +0000
Subject: [PATCH 07/11] Rewrite as DAG combine
---
.../Target/AArch64/AArch64ISelLowering.cpp | 56 +++++++++++++++++--
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 +-
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 44 ---------------
3 files changed, 53 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a9b4965e32b4c..61347009eb2ad 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23938,6 +23938,20 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
return Chain;
}
+static int getFPSubregForVT(EVT VT) {
+ assert(VT.isSimple() && "Expected simple VT");
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f16:
+ return AArch64::hsub;
+ case MVT::f32:
+ return AArch64::ssub;
+ case MVT::f64:
+ return AArch64::dsub;
+ default:
+ llvm_unreachable("Unexpected VT!");
+ }
+}
+
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -23998,17 +24012,49 @@ static SDValue performSTORECombine(SDNode *N,
if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
return Store;
- if (ST->isTruncatingStore()) {
- EVT StoreVT = ST->getMemoryVT();
- if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
- return SDValue();
+ if (ST->isTruncatingStore() &&
+ isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
if (SDValue Rshrnb =
trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
- StoreVT, ST->getMemOperand());
+ MemVT, ST->getMemOperand());
}
}
+ // This is an integer vector_extract_elt followed by a (possibly truncating)
+ // store. We may be able to replace this with a store of an FP subregister.
+ if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
+ Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue Vector = Value.getOperand(0);
+ SDValue ExtIdx = Value.getOperand(1);
+ EVT VectorVT = Vector.getValueType();
+ EVT ElemVT = VectorVT.getVectorElementType();
+ // TODO: Consider allowing Neon (a lot of churn, not necessarily better).
+ if (!VectorVT.isScalableVector())
+ return SDValue();
+ if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8)
+ return SDValue();
+ if (ValueVT != MemVT && !ST->isTruncatingStore())
+ return SDValue();
+
+ EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits());
+ EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT);
+ SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector);
+ SDValue Ext =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPElemVT, Cast, ExtIdx);
+
+ EVT FPMemVT = EVT::getFloatingPointVT(MemVT.getSizeInBits());
+ if (ST->isTruncatingStore() && FPMemVT != FPElemVT) {
+ SDValue Trunc = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
+ FPMemVT, Ext);
+ return DAG.getStore(ST->getChain(), DL, Trunc, ST->getBasePtr(),
+ ST->getMemOperand());
+ }
+
+ return DAG.getStore(ST->getChain(), DL, Ext, ST->getBasePtr(),
+ ST->getMemOperand());
+ }
+
return SDValue();
}
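In outline, the new combine bitcasts the integer vector to an FP vector of
the same element width, extracts the FP element, and, for truncating stores,
takes the narrower FP subregister before emitting a plain store. A sketch of
the i64 -> i32 case (illustrative function, not from the patch):

define void @trunc_lane0_store(ptr %a, <vscale x 2 x i64> %b) {
entry:
  %0 = extractelement <vscale x 2 x i64> %b, i32 0
  %trunc = trunc i64 %0 to i32
  store i32 %trunc, ptr %a, align 4
  ret void
}
; Roughly: bitcast nxv2i64 -> nxv2f64, extract the f64 element, take its
; ssub (f32) subregister, then emit "str s0, [x0]".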
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 92a4890372025..6c61e3a613f6f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4579,6 +4579,8 @@ let Predicates = [IsLE] in {
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
+} // AddedComplexity = 10
+
// unscaled i64 truncating stores
def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
(STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
@@ -4587,8 +4589,6 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
-} // AddedComplexity = 10
-
// Match stores from lane 0 to the appropriate subreg's store.
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
ValueType VTy, ValueType STy,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 723e01853baed..9ea488e6145ad 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1827,52 +1827,8 @@ let Predicates = [HasSVE] in {
defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>;
} // End HasSVE
-multiclass SVEVecStoreLanePat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
- ValueType VTy, ValueType STy,
- ValueType SubRegTy,
- SubRegIndex SubRegIdx, Operand IndexType,
- Instruction STR,
- Instruction DUP, AsmVectorIndexOpnd DUPIdxTy> {
- // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
- def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
- (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
- (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
- GPR64sp:$Rn, IndexType:$offset)>;
-
- // Only used for streaming[-compatible] SVE -- when NEON is available we avoid a DUP.
- // Non-zero immediate index:
- def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
- (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
- (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
- GPR64sp:$Rn, IndexType:$offset)>;
-}
-
let Predicates = [HasSVE_or_SME] in {
- let AddedComplexity = 19 in {
- // Lane 0 truncating stores
- // i32 -> i16
- defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- // i64 -> i32
- defm : SVEVecStoreLanePat<am_indexed32, truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- defm : SVEVecStoreLanePat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- // i64 -> i16
- defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- // i16 -> i16 (technically a truncate as the extracted type is i32)
- defm : SVEVecStoreLanePat<am_indexed16, truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui, DUP_ZZI_H, sve_elm_idx_extdup_h>;
- defm : SVEVecStoreLanePat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi, DUP_ZZI_H, sve_elm_idx_extdup_h>;
-
- // Lane 0 stores
- // i32
- defm : SVEVecStoreLanePat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- defm : SVEVecStoreLanePat<am_unscaled32, store, nxv4i32, i32, f32, ssub, simm9, STURSi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- // i64
- defm : SVEVecStoreLanePat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- defm : SVEVecStoreLanePat<am_unscaled64, store, nxv2i64, i64, f64, dsub, simm9, STURDi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- }
-
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
>From 473144fe323e32b1bb8237076edd0a0bcca8163b Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 6 Mar 2025 19:41:02 +0000
Subject: [PATCH 08/11] Rm whitespace
---
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 9ea488e6145ad..3ee71c14c6bd4 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1828,7 +1828,6 @@ let Predicates = [HasSVE] in {
} // End HasSVE
let Predicates = [HasSVE_or_SME] in {
-
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
>From 2e7219e25ec9c06b7affc169f2d4fc48eabd6315 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 17 Mar 2025 10:48:43 +0000
Subject: [PATCH 09/11] Fixups
---
.../Target/AArch64/AArch64ISelLowering.cpp | 21 +++++++++---
.../aarch64-neon-vector-insert-uaddlv.ll | 5 ++-
llvm/test/CodeGen/AArch64/add.ll | 7 ++--
llvm/test/CodeGen/AArch64/andorxor.ll | 21 +++++-------
llvm/test/CodeGen/AArch64/bitcast.ll | 14 ++++----
llvm/test/CodeGen/AArch64/mul.ll | 7 ++--
llvm/test/CodeGen/AArch64/neon-rshrn.ll | 7 ++--
llvm/test/CodeGen/AArch64/neon-truncstore.ll | 7 ++--
llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 7 ++--
llvm/test/CodeGen/AArch64/shufflevector.ll | 14 ++++----
llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 7 ++--
llvm/test/CodeGen/AArch64/store.ll | 23 ++++---------
llvm/test/CodeGen/AArch64/sub.ll | 7 ++--
...e-streaming-mode-fixed-length-ext-loads.ll | 34 ++++++++-----------
llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 7 ++--
llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 7 ++--
16 files changed, 89 insertions(+), 106 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 61347009eb2ad..918a65b8132be 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23938,7 +23938,7 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
return Chain;
}
-static int getFPSubregForVT(EVT VT) {
+static unsigned getFPSubregForVT(EVT VT) {
assert(VT.isSimple() && "Expected simple VT");
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f16:
@@ -24025,18 +24025,31 @@ static SDValue performSTORECombine(SDNode *N,
// store. We may be able to replace this with a store of an FP subregister.
if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+
SDValue Vector = Value.getOperand(0);
SDValue ExtIdx = Value.getOperand(1);
EVT VectorVT = Vector.getValueType();
EVT ElemVT = VectorVT.getVectorElementType();
- // TODO: Consider allowing Neon (a lot of churn, not necessarily better).
- if (!VectorVT.isScalableVector())
- return SDValue();
if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8)
return SDValue();
if (ValueVT != MemVT && !ST->isTruncatingStore())
return SDValue();
+ // Heuristic: If there are other users of integer scalars extracted from
+ // this vector that won't fold into the store -- abandon folding. This may
+ // extend the vector lifetime and disrupt paired stores.
+ for (auto Use = Vector->use_begin(), End = Vector->use_end(); Use != End;
+ ++Use) {
+ if (Use->getResNo() != Vector.getResNo())
+ continue;
+ SDNode *User = Use->getUser();
+ if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (!User->hasOneUse() ||
+ (*User->user_begin())->getOpcode() != ISD::STORE)
+ return SDValue();
+ }
+ }
+
EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits());
EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT);
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector);
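One shape of case the new heuristic guards against (an illustrative
reduction, not one of the patch's tests): the second extract below is
returned rather than stored, so it needs a GPR regardless, and folding the
first store to an FPR store would only extend the vector's live range.

define i64 @extract_store_and_use(ptr %p, <vscale x 2 x i64> %v) {
  %a = extractelement <vscale x 2 x i64> %v, i32 0
  store i64 %a, ptr %p
  %b = extractelement <vscale x 2 x i64> %v, i32 1
  ret i64 %b
}
; %b's use is not a store, so the combine returns SDValue() and leaves the
; original extract + store sequence alone.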
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 1b7bc128d6332..b357a24f892ff 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -70,10 +70,10 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: str wzr, [x0, #88]
+; CHECK-NEXT: str xzr, [x0, #80]
; CHECK-NEXT: uaddlv.8h s1, v0
; CHECK-NEXT: stp q0, q0, [x0, #16]
; CHECK-NEXT: stp q0, q0, [x0, #48]
-; CHECK-NEXT: str d0, [x0, #80]
; CHECK-NEXT: mov.s v2[0], v1[0]
; CHECK-NEXT: ucvtf.4s v1, v2
; CHECK-NEXT: str q1, [x0]
@@ -146,11 +146,10 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
+; CHECK-NEXT: str xzr, [x0, #16]
; CHECK-NEXT: uaddlv.4s d1, v0
; CHECK-NEXT: mov.d v0[0], v1[0]
-; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: ucvtf.2d v0, v0
-; CHECK-NEXT: str d1, [x0, #16]
; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index e3072dc41d933..fc0ba336b21cc 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -232,10 +232,9 @@ define void @v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x0]
-; CHECK-SD-NEXT: strh w8, [x0, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i16:
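The fixed-length changes in this patch (here and in the files below) all
follow one pattern: a <2 x i16> store is scalarized into two lane extracts
plus truncating stores, and with the scalable-vector restriction dropped
each of those now folds to an h-register store. A reduced example of the
shape involved (illustrative, mirroring the v2i16 tests):

define void @store_v2i16_parts(<2 x i32> %a, ptr %p) {
  %b = trunc <2 x i32> %a to <2 x i16>
  store <2 x i16> %b, ptr %p
  ret void
}
; expected: mov s1, v0.s[1] / str h0, [x0] / str h1, [x0, #2]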
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 5c7429aebb31e..24f2549cce785 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -696,10 +696,9 @@ define void @and_v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x0]
-; CHECK-SD-NEXT: strh w8, [x0, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: and_v2i16:
@@ -733,10 +732,9 @@ define void @or_v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x0]
-; CHECK-SD-NEXT: strh w8, [x0, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: or_v2i16:
@@ -770,10 +768,9 @@ define void @xor_v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x0]
-; CHECK-SD-NEXT: strh w8, [x0, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: xor_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 39f2572d9fd35..d9199ce2c79de 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -102,10 +102,9 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [sp, #12]
-; CHECK-SD-NEXT: strh w8, [sp, #14]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [sp, #12]
+; CHECK-SD-NEXT: str h1, [sp, #14]
; CHECK-SD-NEXT: ldr w0, [sp, #12]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
@@ -401,10 +400,9 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [sp, #12]
-; CHECK-SD-NEXT: strh w8, [sp, #14]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [sp, #12]
+; CHECK-SD-NEXT: str h1, [sp, #14]
; CHECK-SD-NEXT: ldr s0, [sp, #12]
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 9ca975d9e742e..500379d1cfdec 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -244,10 +244,9 @@ define void @v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x0]
-; CHECK-SD-NEXT: strh w8, [x0, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i16:
diff --git a/llvm/test/CodeGen/AArch64/neon-rshrn.ll b/llvm/test/CodeGen/AArch64/neon-rshrn.ll
index e648b10ea357b..8fabd7a618f68 100644
--- a/llvm/test/CodeGen/AArch64/neon-rshrn.ll
+++ b/llvm/test/CodeGen/AArch64/neon-rshrn.ll
@@ -868,10 +868,9 @@ define void @rshrn_v2i32_4(<2 x i32> %a, ptr %p) {
; CHECK-NEXT: movi v1.2s, #8
; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ushr v0.2s, v0.2s, #4
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x0]
-; CHECK-NEXT: strh w8, [x0, #2]
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: str h1, [x0, #2]
; CHECK-NEXT: ret
entry:
%b = add <2 x i32> %a, <i32 8, i32 8>
diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
index 5d78ad24eb333..3d3362d314a99 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
@@ -42,10 +42,9 @@ define void @v2i32_v2i16(<2 x i32> %a, ptr %result) {
; CHECK-LABEL: v2i32_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x0]
-; CHECK-NEXT: strh w8, [x0, #2]
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: str h1, [x0, #2]
; CHECK-NEXT: ret
%b = trunc <2 x i32> %a to <2 x i16>
store <2 x i16> %b, ptr %result
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 531562d3aa678..4d76994be204f 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -256,10 +256,9 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x2]
-; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x2]
+; CHECK-SD-NEXT: str h1, [x2, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i16:
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index 0f5b240e387ed..4c8f0c9c446f5 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -288,10 +288,9 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [sp, #12]
-; CHECK-SD-NEXT: strh w8, [sp, #14]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [sp, #12]
+; CHECK-SD-NEXT: str h1, [sp, #14]
; CHECK-SD-NEXT: ldr w0, [sp, #12]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
@@ -499,10 +498,9 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: dup v1.2s, v0.s[0]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [sp, #12]
-; CHECK-SD-NEXT: mov w8, v1.s[1]
-; CHECK-SD-NEXT: strh w8, [sp, #14]
+; CHECK-SD-NEXT: str h0, [sp, #12]
+; CHECK-SD-NEXT: mov s1, v1.s[1]
+; CHECK-SD-NEXT: str h1, [sp, #14]
; CHECK-SD-NEXT: ldr w0, [sp, #12]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index be4a5843e8215..ae2a16929e254 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -257,10 +257,9 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x2]
-; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x2]
+; CHECK-SD-NEXT: str h1, [x2, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i16:
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 86d74b69f4958..37a6ad08d4cb3 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -147,22 +147,13 @@ define void @store_v32i8(<32 x i8> %a, ptr %ptr){
}
define void @store_v2i16(<2 x i16> %a, ptr %ptr){
-; CHECK-SD-LABEL: store_v2i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x0]
-; CHECK-SD-NEXT: strh w8, [x0, #2]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: store_v2i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: str h0, [x0]
-; CHECK-GI-NEXT: str h1, [x0, #2]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: store_v2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: str h1, [x0, #2]
+; CHECK-NEXT: ret
store <2 x i16> %a, ptr %ptr
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 8f35a69f52b85..8183a82f21cb5 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -232,10 +232,9 @@ define void @v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x0]
-; CHECK-SD-NEXT: strh w8, [x0, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 2c891251befc7..7d6336a43a4fd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -269,27 +269,23 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.d, z0.s
; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: mov z2.d, z1.d[1]
-; CHECK-NEXT: str d1, [x8]
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: asr x9, x9, #63
-; CHECK-NEXT: mov z1.d, z0.d[1]
-; CHECK-NEXT: str d0, [x8, #64]
-; CHECK-NEXT: stp x9, x9, [x8, #16]
-; CHECK-NEXT: str x9, [x8, #8]
-; CHECK-NEXT: asr x9, x10, #63
-; CHECK-NEXT: fmov x10, d2
-; CHECK-NEXT: str d2, [x8, #32]
-; CHECK-NEXT: stp x9, x9, [x8, #80]
-; CHECK-NEXT: str x9, [x8, #72]
+; CHECK-NEXT: mov z1.d, z1.d[1]
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: asr x10, x9, #63
+; CHECK-NEXT: stp x9, x10, [x8]
; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: asr x10, x10, #63
-; CHECK-NEXT: str d1, [x8, #96]
+; CHECK-NEXT: asr x12, x11, #63
+; CHECK-NEXT: stp x10, x10, [x8, #16]
+; CHECK-NEXT: stp x11, x12, [x8, #64]
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: asr x10, x9, #63
+; CHECK-NEXT: stp x12, x12, [x8, #80]
; CHECK-NEXT: stp x10, x10, [x8, #48]
-; CHECK-NEXT: asr x9, x9, #63
-; CHECK-NEXT: str x10, [x8, #40]
-; CHECK-NEXT: stp x9, x9, [x8, #112]
-; CHECK-NEXT: str x9, [x8, #104]
+; CHECK-NEXT: asr x12, x11, #63
+; CHECK-NEXT: stp x9, x10, [x8, #32]
+; CHECK-NEXT: stp x12, x12, [x8, #112]
+; CHECK-NEXT: stp x11, x12, [x8, #96]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
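This test is consistent with the heuristic above paying off: each extracted
lane also feeds an "asr" to materialize its sign bits, so the extracts stay
in GPRs and the value/sign-extension pairs can merge into stp. A reduced
illustration (not part of the patch):

define void @extract_and_sext_store(ptr %p, <vscale x 2 x i64> %v) {
  %x = extractelement <vscale x 2 x i64> %v, i32 0
  %neg = ashr i64 %x, 63
  store i64 %x, ptr %p
  %q = getelementptr inbounds i64, ptr %p, i64 1
  store i64 %neg, ptr %q
  ret void
}
; %x has a non-store user (the ashr), so the fold is skipped and the two
; stores can pair, e.g. "stp x9, x10, [x8]" as in the output above.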
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index 924bd3981779e..d0173307bd830 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -255,10 +255,9 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: mov v1.s[1], w11
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x2]
-; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x2]
+; CHECK-SD-NEXT: str h1, [x2, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i16:
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index a623eb554cac7..dc3ebfb0682ca 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -252,10 +252,9 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: mov v0.s[1], w10
; CHECK-SD-NEXT: mov v1.s[1], w11
; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x2]
-; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x2]
+; CHECK-SD-NEXT: str h1, [x2, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i16:
>From f64904cb78ec2f9c3ef5a15c2ff46bae490f8394 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 17 Mar 2025 17:47:20 +0000
Subject: [PATCH 10/11] Fixups
---
.../Target/AArch64/AArch64ISelLowering.cpp | 20 +++++++++----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 918a65b8132be..879b83f94b79a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24036,18 +24036,16 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
// Heuristic: If there are other users of integer scalars extracted from
- // this vector that won't fold into the store -- abandon folding. This may
- // extend the vector lifetime and disrupt paired stores.
- for (auto Use = Vector->use_begin(), End = Vector->use_end(); Use != End;
- ++Use) {
- if (Use->getResNo() != Vector.getResNo())
+ // this vector that won't fold into the store -- abandon folding. Applying
+ // this fold may extend the vector lifetime and disrupt paired stores.
+ for (const auto &Use : Vector->uses()) {
+ if (Use.getResNo() != Vector.getResNo())
continue;
- SDNode *User = Use->getUser();
- if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- if (!User->hasOneUse() ||
- (*User->user_begin())->getOpcode() != ISD::STORE)
- return SDValue();
- }
+ const SDNode *User = Use.getUser();
+ if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ (!User->hasOneUse() ||
+ (*User->user_begin())->getOpcode() != ISD::STORE))
+ return SDValue();
}
EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits());
>From a6897836e206de07584447ca184cd32c3c41ece9 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 18 Mar 2025 09:53:01 +0000
Subject: [PATCH 11/11] Rebase: Update tests
---
llvm/test/CodeGen/AArch64/ctlz.ll | 7 +++----
llvm/test/CodeGen/AArch64/ctpop.ll | 7 +++----
llvm/test/CodeGen/AArch64/cttz.ll | 7 +++----
3 files changed, 9 insertions(+), 12 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index 437e3d5ff75c6..7b8f6cf24f278 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -302,10 +302,9 @@ define void @v2i16(ptr %p1) {
; CHECK-SD-NEXT: mov v1.s[1], w9
; CHECK-SD-NEXT: clz v1.2s, v1.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [x0]
-; CHECK-SD-NEXT: strh w8, [x0, #2]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str h0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i16:
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index 785a447123b5e..2299b5c5a5af9 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -122,10 +122,9 @@ define void @v2i16(ptr %p1) {
; CHECK-NEXT: cnt v0.8b, v0.8b
; CHECK-NEXT: uaddlp v0.4h, v0.8b
; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x0]
-; CHECK-NEXT: strh w8, [x0, #2]
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: str h1, [x0, #2]
; CHECK-NEXT: ret
entry:
%d = load <2 x i16>, ptr %p1
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index a254df229c127..9bc0970deeeda 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -164,10 +164,9 @@ define void @v2i16(ptr %p1) {
; CHECK-NEXT: movi v1.2s, #32
; CHECK-NEXT: clz v0.2s, v0.2s
; CHECK-NEXT: sub v0.2s, v1.2s, v0.2s
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x0]
-; CHECK-NEXT: strh w8, [x0, #2]
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: str h1, [x0, #2]
; CHECK-NEXT: ret
entry:
%d = load <2 x i16>, ptr %p1