[llvm] [AArch64] Keep floating-point conversion in SIMD (PR #147707)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 13 00:11:11 PDT 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/147707
From 58e48af7489de6b8e9c53b6dbf4b55ad12633495 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 8 Jul 2025 19:36:14 +0300
Subject: [PATCH] [AArch64] Keep floating-point conversion in SIMD
Stores can be issued faster if the converted value is kept in SIMD/FP registers rather than being moved to a general-purpose register first.
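For illustration, one of the new tests below now compiles to a direct store
from the SIMD/FP register file; the "before" assembly is my reconstruction of
the previous lowering through a GPR, matching the updated tbl-loops.ll checks:

define void @f32_to_u8(float %f, ptr %dst) {
entry:
  %conv = fptoui float %f to i32
  %trunc = trunc i32 %conv to i8
  store i8 %trunc, ptr %dst
  ret void
}

; Before: the result took a round trip through a GPR:
;   fcvtzu w8, s0
;   strb w8, [x0]
; After: it stays in the SIMD/FP register file:
;   fcvtzu s0, s0
;   str b0, [x0]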
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 45 +++++++
.../CodeGen/AArch64/store-float-conversion.ll | 117 ++++++++++++++++++
llvm/test/CodeGen/AArch64/tbl-loops.ll | 38 +++---
3 files changed, 181 insertions(+), 19 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/store-float-conversion.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ddc685fae5e9a..19e27ef85bee8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6612,6 +6612,19 @@ def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
(FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+let HasOneUse = 1 in {
+def fp_to_uint_oneuse : PatFrag<(ops node:$src0), (fp_to_uint $src0)>;
+def fp_to_sint_oneuse : PatFrag<(ops node:$src0), (fp_to_sint $src0)>;
+}
+
+class TruncStoreMaybeAssert<PatFrag op> : PatFrags<(ops node:$val, node:$ptr),
+ [(op node:$val, node:$ptr),
+ (op (assertzext node:$val), node:$ptr),
+ (op (assertsext node:$val), node:$ptr)]>;
+
+def truncstorei8_maybe_assert : TruncStoreMaybeAssert<truncstorei8>;
+def truncstorei16_maybe_assert : TruncStoreMaybeAssert<truncstorei16>;
+
// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
@@ -6646,6 +6659,38 @@ def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
(UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
+// A float -> int conversion followed by a store should store the value
+// directly from the first SIMD lane to avoid an expensive FPR -> GPR transfer.
+let AddedComplexity = 10 in {
+// f32 -> i8
+def : Pat<(truncstorei8_maybe_assert (i32 (fp_to_uint_oneuse (f32 FPR32:$src))), (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+ (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), bsub)),
+ GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(truncstorei8_maybe_assert (i32 (fp_to_sint_oneuse (f32 FPR32:$src))), (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+ (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), bsub)),
+ GPR64sp:$Rn, uimm12s1:$offset)>;
+
+// f32 -> i16
+def : Pat<(truncstorei16_maybe_assert (i32 (fp_to_uint_oneuse (f32 FPR32:$src))), (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+ (STRHui (f16 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), hsub)),
+ GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(truncstorei16_maybe_assert (i32 (fp_to_sint_oneuse (f32 FPR32:$src))), (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+ (STRHui (f16 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), hsub)),
+ GPR64sp:$Rn, uimm12s2:$offset)>;
+
+// f32 -> i32
+def : Pat<(store (i32 (fp_to_uint_oneuse f32:$src)), (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
+ (STRSui (FCVTZUv1i32 f32:$src), GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(store (i32 (fp_to_sint_oneuse f32:$src)), (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
+ (STRSui (FCVTZSv1i32 f32:$src), GPR64sp:$Rn, uimm12s4:$offset)>;
+
+// f64 -> i64
+def : Pat<(store (i64 (fp_to_uint_oneuse f64:$src)), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui (FCVTZUv1i64 f64:$src), GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (i64 (fp_to_sint_oneuse f64:$src)), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui (FCVTZSv1i64 f64:$src), GPR64sp:$Rn, uimm12s8:$offset)>;
+} // AddedComplexity = 10
+
// fp16: integer extraction from vector must be at least 32-bits to be legal.
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
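Two details in the patterns above are worth spelling out. The HasOneUse
wrappers keep the patterns from firing when the integer result has other
users and must end up in a GPR anyway (see the multiple-uses test below).
The assertzext/assertsext alternatives in TruncStoreMaybeAssert exist
because, when the IR converts straight to a narrow integer type, type
legalization promotes the result to i32 and wraps it in an
AssertZext/AssertSext node before the truncating store. A minimal IR sketch
of an input that should take that path (my own example, not part of this
patch):

define void @f32_to_u8_direct(float %f, ptr %dst) {
entry:
  ; Converting directly to i8 gets promoted to i32 during type legalization,
  ; so the store sees (truncstorei8 (assertzext ...)).
  %conv = fptoui float %f to i8
  store i8 %conv, ptr %dst
  ret void
}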
diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
new file mode 100644
index 0000000000000..ca12fcb1dcc1b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
+
+define void @f32_to_u8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
+define void @f32_to_s32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
+define void @f64_to_u64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
+define void @f64_to_s64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
+define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_i32_multiple_uses:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: mov x9, x0
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: strb w8, [x9]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret i32 %conv
+}
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index aa0a163b96ac8..ecbce4d315036 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: fcsel s3, s0, s4, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w11, [x9]
-; CHECK-NEXT: fcvtzs w12, s3
-; CHECK-NEXT: strb w12, [x9, #1]
+; CHECK-NEXT: str b2, [x9]
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: str b3, [x9, #1]
; CHECK-NEXT: add x9, x9, #2
; CHECK-NEXT: b.ne .LBB1_6
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: ldr s3, [x8, #8]
-; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: add x8, x8, #12
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
-; CHECK-NEXT: strb w11, [x9]
+; CHECK-NEXT: str b2, [x9]
; CHECK-NEXT: fcsel s5, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w12, s4
+; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s3, s0, s5, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w12, [x9, #1]
-; CHECK-NEXT: fcvtzs w13, s3
-; CHECK-NEXT: strb w13, [x9, #2]
+; CHECK-NEXT: str b4, [x9, #1]
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: str b3, [x9, #2]
; CHECK-NEXT: add x9, x9, #3
; CHECK-NEXT: b.ne .LBB2_8
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: ldp s3, s5, [x8, #8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
-; CHECK-NEXT: strb w11, [x9]
-; CHECK-NEXT: fcvtzs w12, s4
+; CHECK-NEXT: str b2, [x9]
+; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s6, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcsel s3, s0, s6, mi
; CHECK-NEXT: fcmp s5, s1
-; CHECK-NEXT: strb w12, [x9, #1]
+; CHECK-NEXT: str b4, [x9, #1]
; CHECK-NEXT: fcsel s6, s1, s5, gt
; CHECK-NEXT: fcmp s5, #0.0
-; CHECK-NEXT: fcvtzs w13, s3
-; CHECK-NEXT: fcsel s2, s0, s6, mi
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: fcsel s5, s0, s6, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w13, [x9, #2]
-; CHECK-NEXT: fcvtzs w14, s2
-; CHECK-NEXT: strb w14, [x9, #3]
+; CHECK-NEXT: str b3, [x9, #2]
+; CHECK-NEXT: fcvtzs s5, s5
+; CHECK-NEXT: str b5, [x9, #3]
; CHECK-NEXT: add x9, x9, #4
; CHECK-NEXT: b.ne .LBB3_6
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup