[llvm] [AArch64] Keep floating-point conversion in SIMD (PR #147707)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 29 09:28:02 PDT 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/147707
From 611070bf0eb669c22350d1346c42644574da2194 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Thu, 17 Jul 2025 21:20:14 +0300
Subject: [PATCH] [AArch64] Keep floating-point conversion in SIMD
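
Keep a scalar fp -> int conversion in a SIMD register when its result is
only used by a store, instead of transferring it through a GPR first. For
example (taken from the selectopt-const.ll diff below):

  fcvtzs w10, s3
  str    w10, [x2], #4

becomes:

  fcvtzs s3, s3
  st1    { v3.s }[0], [x2], #4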
---
.../Target/AArch64/AArch64ISelLowering.cpp | 65 +++++++++
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 9 ++
llvm/test/CodeGen/AArch64/selectopt-const.ll | 4 +-
.../CodeGen/AArch64/store-float-conversion.ll | 131 ++++++++++++++++++
llvm/test/CodeGen/AArch64/tbl-loops.ll | 42 +++---
5 files changed, 228 insertions(+), 23 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/store-float-conversion.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7c9fc67bb0119..8d580803eea61 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24112,6 +24112,68 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
Store->getMemOperand());
}
+// Combine store (fp_to_int X) to use vector semantics around the conversion
+// when NEON is available. This allows the result to be stored directly from
+// the SIMD register instead of being transferred through a GPR first.
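+//
+// For example, the combine rewrites
+//   (store (fp_to_sint f32:$src), addr)
+// into
+//   (store (extract_vector_elt (fp_to_sint (scalar_to_vector $src)), 0), addr)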
+static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        SelectionDAG &DAG,
+                                        const AArch64Subtarget *Subtarget) {
+  // Limit to post-legalization in order to avoid peeling truncating stores.
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+  if (!Subtarget->isNeonAvailable())
+    return SDValue();
+  // Bail out if the stored value is already a vector.
+  SDValue Value = ST->getValue();
+  if (Value.getValueType().isVector())
+    return SDValue();
+
+  // Look through potential assertions.
+  while (Value->isAssert())
+    Value = Value.getOperand(0);
+
+  if (Value.getOpcode() != ISD::FP_TO_SINT &&
+      Value.getOpcode() != ISD::FP_TO_UINT)
+    return SDValue();
+  if (!Value->hasOneUse())
+    return SDValue();
+
+  SDValue FPSrc = Value.getOperand(0);
+  EVT SrcVT = FPSrc.getValueType();
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return SDValue();
+
+  // No support for conversions with mismatched widths, such as
+  // i64 = fp_to_sint f32.
+  EVT VT = Value.getSimpleValueType();
+  if (VT != SrcVT.changeTypeToInteger())
+    return SDValue();
+
+  // Create a full 128-bit vector so that type legalization does not widen it
+  // later. The scalar conversion is then selected to a single-element SIMD
+  // conversion via the new patterns in AArch64InstrInfo.td.
+  unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
+  EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
+  EVT VecDstVT = VecSrcVT.changeTypeToInteger();
+  SDLoc DL(ST);
+  SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
+  SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
+
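+  // For truncating stores, reinterpret the converted vector as one with the
+  // narrower memory element type, so the truncated value can be taken
+  // straight from lane 0.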
+  if (ST->isTruncatingStore()) {
+    EVT NewVecDstVT = EVT::getVectorVT(
+        *DAG.getContext(), ST->getMemoryVT(),
+        VecDstVT.getFixedSizeInBits() /
+            ST->getMemoryVT().getFixedSizeInBits());
+    VecConv = DAG.getNode(AArch64ISD::NVCAST, DL, NewVecDstVT, VecConv);
+  }
+
+  SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+  SDValue Extracted =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
+
+  DCI.CombineTo(ST->getValue().getNode(), Extracted);
+  return SDValue(ST, 0);
+}
+
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -24194,6 +24256,9 @@ static SDValue performSTORECombine(SDNode *N,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc DL(ST);
+  if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
+    return Res;
+
   auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
     EVT EltVT = VT.getVectorElementType();
     return EltVT == MVT::f32 || EltVT == MVT::f64;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 07cacfa7cf394..251fd44b6ea31 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6668,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}
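+// fp -> int conversion of a value in lane 0 of a simd vector should use the
+// in-vector fcvtz variant to avoid costly fpr <-> gpr register transfers.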
+def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+          (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+          (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+          (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
+def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+          (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
+
// int -> float conversion of value in lane 0 of simd vector should use
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll
index a44c746e0f281..fe48dbaf1ab76 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-const.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll
@@ -29,8 +29,8 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-NEXT: csel x10, x9, xzr, lt
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: ldr s3, [x4, x10]
-; CHECK-NEXT: fcvtzs w10, s3
-; CHECK-NEXT: str w10, [x2], #4
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: st1 { v3.s }[0], [x2], #4
; CHECK-NEXT: b.ne .LBB0_2
; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
; CHECK-NEXT: mov w0, wzr
diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
new file mode 100644
index 0000000000000..c46801fc16714
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
+
+define void @f32_to_u8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
+define void @f32_to_s32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
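+; The stored value is the sign-extension of the conversion rather than the
+; conversion itself, so the combine does not apply and the GPR path is kept.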
+define void @f32_to_s64(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: sxtw x8, w8
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %ext = sext i32 %conv to i64
+ store i64 %ext, ptr %dst
+ ret void
+}
+
+define void @f64_to_u64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
+define void @f64_to_s64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
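+; The conversion result is also returned, so it has a non-store use and must
+; be materialized in a GPR anyway.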
+define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_i32_multiple_uses:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: mov x9, x0
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: strb w8, [x9]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret i32 %conv
+}
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index aa0a163b96ac8..223698ba225a8 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -63,8 +63,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s2, #0.0
; CHECK-NEXT: fcsel s2, s0, s3, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: fcvtzs w11, s2
-; CHECK-NEXT: strb w11, [x9], #1
+; CHECK-NEXT: fcvtzs s2, s2
+; CHECK-NEXT: st1 { v2.b }[0], [x9], #1
; CHECK-NEXT: b.ne .LBB0_7
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
; CHECK-NEXT: ret
@@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: fcsel s3, s0, s4, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w11, [x9]
-; CHECK-NEXT: fcvtzs w12, s3
-; CHECK-NEXT: strb w12, [x9, #1]
+; CHECK-NEXT: str b2, [x9]
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: stur b3, [x9, #1]
; CHECK-NEXT: add x9, x9, #2
; CHECK-NEXT: b.ne .LBB1_6
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: ldr s3, [x8, #8]
-; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: add x8, x8, #12
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
-; CHECK-NEXT: strb w11, [x9]
+; CHECK-NEXT: str b2, [x9]
; CHECK-NEXT: fcsel s5, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w12, s4
+; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s3, s0, s5, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w12, [x9, #1]
-; CHECK-NEXT: fcvtzs w13, s3
-; CHECK-NEXT: strb w13, [x9, #2]
+; CHECK-NEXT: stur b4, [x9, #1]
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: stur b3, [x9, #2]
; CHECK-NEXT: add x9, x9, #3
; CHECK-NEXT: b.ne .LBB2_8
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: ldp s3, s5, [x8, #8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
-; CHECK-NEXT: strb w11, [x9]
-; CHECK-NEXT: fcvtzs w12, s4
+; CHECK-NEXT: str b2, [x9]
+; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s6, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcsel s3, s0, s6, mi
; CHECK-NEXT: fcmp s5, s1
-; CHECK-NEXT: strb w12, [x9, #1]
+; CHECK-NEXT: stur b4, [x9, #1]
; CHECK-NEXT: fcsel s6, s1, s5, gt
; CHECK-NEXT: fcmp s5, #0.0
-; CHECK-NEXT: fcvtzs w13, s3
-; CHECK-NEXT: fcsel s2, s0, s6, mi
+; CHECK-NEXT: fcvtzs s3, s3
+; CHECK-NEXT: fcsel s5, s0, s6, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w13, [x9, #2]
-; CHECK-NEXT: fcvtzs w14, s2
-; CHECK-NEXT: strb w14, [x9, #3]
+; CHECK-NEXT: stur b3, [x9, #2]
+; CHECK-NEXT: fcvtzs s5, s5
+; CHECK-NEXT: stur b5, [x9, #3]
; CHECK-NEXT: add x9, x9, #4
; CHECK-NEXT: b.ne .LBB3_6
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup