[llvm] [AArch64] Improve post-inc stores of SIMD/FP values (PR #151372)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 18 04:05:30 PST 2026
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/151372
From ead75632271fd778ed89021d003748c0abadfa8f Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Wed, 18 Feb 2026 13:34:52 +0200
Subject: [PATCH] [AArch64] Improve post-inc stores of SIMD/FP values
Add patterns to match post-increment truncating stores from lane 0 of
wide integer vectors (v4i32/v2i64) to narrower types (i8/i16/i32).
This avoids transferring the value through a general-purpose register
(GPR) when storing, using ST1 (single structure, post-index) instead.
Also remove the pre-legalization early-exit in combineStoreValueFPToInt,
as it prevented the optimization from applying in some cases.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 3 -
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 7 +
.../CodeGen/AArch64/store-float-conversion.ll | 260 ++++++++++++++++++
llvm/test/CodeGen/AArch64/tbl-loops.ll | 3 +-
4 files changed, 268 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 66c22db0491d1..056160de318bc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25624,9 +25624,6 @@ static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
- // Limit to post-legalization in order to avoid peeling truncating stores.
- if (DCI.isBeforeLegalize())
- return SDValue();
if (!Subtarget->isNeonAvailable())
return SDValue();
// Source operand is already a vector.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a70f0e2f5da0c..42a1fbfe0cb70 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9934,6 +9934,13 @@ defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
defm : St1LanePost128Pat<post_store, VectorIndexH, v8bf16, bf16, ST1i16_POST, 2>;
+// Truncating post-inc stores from lane 0 of v4i32/v2i64.
+defm : St1LanePost128Pat<post_truncsti8, VectorIndex0, v4i32, i32, ST1i8_POST, 1>;
+defm : St1LanePost128Pat<post_truncsti16, VectorIndex0, v4i32, i32, ST1i16_POST, 2>;
+defm : St1LanePost128Pat<post_truncsti8, VectorIndex0, v2i64, i64, ST1i8_POST, 1>;
+defm : St1LanePost128Pat<post_truncsti16, VectorIndex0, v2i64, i64, ST1i16_POST, 2>;
+defm : St1LanePost128Pat<post_truncsti32, VectorIndex0, v2i64, i64, ST1i32_POST, 4>;
+
let mayStore = 1, hasSideEffects = 0 in {
defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
index c46801fc16714..bccbf489601aa 100644
--- a/llvm/test/CodeGen/AArch64/store-float-conversion.ll
+++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
@@ -27,6 +27,34 @@ entry:
ret void
}
+define ptr @f32_to_s8_inc(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s8_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: st1 { v0.b }[0], [x0], #1
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ %next = getelementptr i8, ptr %dst, i64 1
+ store i8 %trunc, ptr %dst
+ ret ptr %next
+}
+
+define ptr @f32_to_u8_inc(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u8_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: st1 { v0.b }[0], [x0], #1
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i8
+ %next = getelementptr i8, ptr %dst, i64 1
+ store i8 %trunc, ptr %dst
+ ret ptr %next
+}
+
define void @f32_to_u16(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u16:
; CHECK: // %bb.0: // %entry
@@ -53,6 +81,34 @@ entry:
ret void
}
+define ptr @f32_to_s16_inc(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s16_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: st1 { v0.h }[0], [x0], #2
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i16
+ %next = getelementptr i16, ptr %dst, i64 1
+ store i16 %trunc, ptr %dst
+ ret ptr %next
+}
+
+define ptr @f32_to_u16_inc(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u16_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: st1 { v0.h }[0], [x0], #2
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i16
+ %next = getelementptr i16, ptr %dst, i64 1
+ store i16 %trunc, ptr %dst
+ ret ptr %next
+}
+
define void @f32_to_u32(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u32:
; CHECK: // %bb.0: // %entry
@@ -77,6 +133,32 @@ entry:
ret void
}
+define ptr @f32_to_s32_inc(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s32_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: st1 { v0.s }[0], [x0], #4
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %next = getelementptr i32, ptr %dst, i64 1
+ store i32 %conv, ptr %dst
+ ret ptr %next
+}
+
+define ptr @f32_to_u32_inc(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u32_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: st1 { v0.s }[0], [x0], #4
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %next = getelementptr i32, ptr %dst, i64 1
+ store i32 %conv, ptr %dst
+ ret ptr %next
+}
+
define void @f32_to_s64(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_s64:
; CHECK: // %bb.0: // %entry
@@ -115,6 +197,170 @@ entry:
ret void
}
+define ptr @f64_to_s64_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s64_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: st1 { v0.d }[0], [x0], #8
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi double %d to i64
+ %next = getelementptr i64, ptr %dst, i64 1
+ store i64 %conv, ptr %dst
+ ret ptr %next
+}
+
+define ptr @f64_to_u64_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u64_inc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: st1 { v0.d }[0], [x0], #8
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui double %d to i64
+ %next = getelementptr i64, ptr %dst, i64 1
+ store i64 %conv, ptr %dst
+ ret ptr %next
+}
+
+define void @f64_to_u8(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+ %conv = fptoui double %d to i64
+ %trunc = trunc i64 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f64_to_s8(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define ptr @f64_to_s8_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s8_inc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: st1 { v0.b }[0], [x0], #1
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i8
+ store i8 %trunc, ptr %dst
+ %next = getelementptr i8, ptr %dst, i64 1
+ ret ptr %next
+}
+
+define ptr @f64_to_u8_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u8_inc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: st1 { v0.b }[0], [x0], #1
+; CHECK-NEXT: ret
+ %conv = fptoui double %d to i64
+ %trunc = trunc i64 %conv to i8
+ store i8 %trunc, ptr %dst
+ %next = getelementptr i8, ptr %dst, i64 1
+ ret ptr %next
+}
+
+define void @f64_to_u16(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+ %conv = fptoui double %d to i64
+ %trunc = trunc i64 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f64_to_s16(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define ptr @f64_to_s16_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s16_inc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: st1 { v0.h }[0], [x0], #2
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i16
+ %next = getelementptr i16, ptr %dst, i64 1
+ store i16 %trunc, ptr %dst
+ ret ptr %next
+}
+
+define ptr @f64_to_u16_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u16_inc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: st1 { v0.h }[0], [x0], #2
+; CHECK-NEXT: ret
+ %conv = fptoui double %d to i64
+ %trunc = trunc i64 %conv to i16
+ %next = getelementptr i16, ptr %dst, i64 1
+ store i16 %trunc, ptr %dst
+ ret ptr %next
+}
+
+define void @f64_to_s32(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i32
+ store i32 %trunc, ptr %dst
+ ret void
+}
+
+define ptr @f64_to_s32_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s32_inc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: st1 { v0.s }[0], [x0], #4
+; CHECK-NEXT: ret
+ %conv = fptosi double %d to i64
+ %trunc = trunc i64 %conv to i32
+ %next = getelementptr i32, ptr %dst, i64 1
+ store i32 %trunc, ptr %dst
+ ret ptr %next
+}
+
+define ptr @f64_to_u32_inc(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u32_inc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: st1 { v0.s }[0], [x0], #4
+; CHECK-NEXT: ret
+ %conv = fptoui double %d to i64
+ %trunc = trunc i64 %conv to i32
+ %next = getelementptr i32, ptr %dst, i64 1
+ store i32 %trunc, ptr %dst
+ ret ptr %next
+}
+
define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_i32_multiple_uses:
; CHECK: // %bb.0: // %entry
@@ -129,3 +375,17 @@ entry:
store i8 %trunc, ptr %dst
ret i32 %conv
}
+
+; Negative test: extracting from lane 1 must go through GPR.
+define ptr @v4i32_lane1_to_i8_inc(<4 x i32> %v, ptr %dst) {
+; CHECK-LABEL: v4i32_lane1_to_i8_inc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: strb w8, [x0], #1
+; CHECK-NEXT: ret
+ %elt = extractelement <4 x i32> %v, i32 1
+ %trunc = trunc i32 %elt to i8
+ store i8 %trunc, ptr %dst
+ %next = getelementptr i8, ptr %dst, i64 1
+ ret ptr %next
+}
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 0f629971b5844..84af8596a0e99 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -64,8 +64,7 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcsel s2, s0, s3, mi
; CHECK-NEXT: subs w10, w10, #1
; CHECK-NEXT: fcvtzs s2, s2
-; CHECK-NEXT: fmov w11, s2
-; CHECK-NEXT: strb w11, [x9], #1
+; CHECK-NEXT: st1 { v2.b }[0], [x9], #1
; CHECK-NEXT: b.ne .LBB0_7
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
; CHECK-NEXT: ret
More information about the llvm-commits
mailing list