[llvm-branch-commits] [llvm] [AArch64] Fold fcvtzu/fcvtzs(uitofp/sitofp(x)) roundtrip (PR #190328)
Valeriy Savchenko via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Apr 3 02:42:59 PDT 2026
https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/190328
>From fa8a303ccbcdfc4cf040442bc7b02e02b35c7b30 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Thu, 2 Apr 2026 22:42:34 +0100
Subject: [PATCH] [AArch64] Fold fcvtzu/fcvtzs(uitofp/sitofp(x)) roundtrip
stack-info: PR: https://github.com/llvm/llvm-project/pull/190328, branch: users/SavchenkoValeriy/feat/instcombine/fcvtzu_fcvtzs_roundtrip/stack/2
---
.../AArch64/AArch64TargetTransformInfo.cpp | 47 ++++
.../AArch64/neon-fcvtz-roundtrip.ll | 210 ++++++++++++++++++
2 files changed, 257 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/neon-fcvtz-roundtrip.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 734339e5c7a05..7412a7470fa41 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2869,6 +2869,50 @@ static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
return std::nullopt;
}
+// Fold an int->FP->int round trip through a NEON convert back to the integer:
+//   fcvtzu(uitofp(x)) -> x or zext(x)
+//   fcvtzs(sitofp(x)) -> x or sext(x)
+// These intrinsics saturate, so unlike generic fptoui/fptosi we cannot exploit
+// UB on overflow; we bail out with std::nullopt unless we can prove:
+// 1. The int->FP cast is exact (no precision loss)
+// 2. The value fits in the destination integer type
+static std::optional<Instruction *> instCombineNeonFCvtz(InstCombiner &IC,
+ IntrinsicInst &II) {
+ bool IsUnsigned = II.getIntrinsicID() == Intrinsic::aarch64_neon_fcvtzu;
+
+ // Only handle same-sign pairs for now: fcvtzu+uitofp or fcvtzs+sitofp.
+ CastInst *IToFP;
+ if (IsUnsigned)
+ IToFP = dyn_cast<UIToFPInst>(II.getArgOperand(0));
+ else
+ IToFP = dyn_cast<SIToFPInst>(II.getArgOperand(0));
+ if (!IToFP)
+ return std::nullopt;
+
+ // Check that int->FP is exact (no rounding), so the FP value equals X.
+ if (!IC.isKnownExactCastIntToFP(*IToFP))
+ return std::nullopt;
+
+ Value *X = IToFP->getOperand(0);
+ Type *SrcIntTy = X->getType();
+ Type *DstIntTy = II.getType();
+ unsigned SrcBits = SrcIntTy->getScalarSizeInBits();
+ unsigned DstBits = DstIntTy->getScalarSizeInBits();
+
+ // A same-sign source no wider than the destination cannot saturate.
+ // TODO: support narrowing with MaxActiveBits analysis.
+ if (SrcBits > DstBits)
+ return std::nullopt;
+
+ if (SrcBits == DstBits)
+ return IC.replaceInstUsesWith(II, X);
+
+ IC.Builder.SetInsertPoint(&II);
+ Value *Ext = IsUnsigned ? IC.Builder.CreateZExt(X, DstIntTy)
+ : IC.Builder.CreateSExt(X, DstIntTy);
+ return IC.replaceInstUsesWith(II, Ext);
+}
+
static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
IntrinsicInst &II) {
Value *Pred = II.getOperand(0);
@@ -3095,6 +3139,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVEUxt(IC, II, 32);
case Intrinsic::aarch64_sme_in_streaming_mode:
return instCombineInStreamingMode(IC, II);
+ case Intrinsic::aarch64_neon_fcvtzu:
+ case Intrinsic::aarch64_neon_fcvtzs:
+ return instCombineNeonFCvtz(IC, II);
}
return std::nullopt;
diff --git a/llvm/test/Transforms/InstCombine/AArch64/neon-fcvtz-roundtrip.ll b/llvm/test/Transforms/InstCombine/AArch64/neon-fcvtz-roundtrip.ll
new file mode 100644
index 0000000000000..c1397ea0fefb8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/neon-fcvtz-roundtrip.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine -mtriple aarch64 %s | FileCheck %s
+
+define <8 x i16> @fcvtzu_uitofp_zext_i8(<8 x i8> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_zext_i8(
+; CHECK-NEXT: [[CVT:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> [[CVT]]
+;
+ %zext = zext <8 x i8> %a to <8 x i16>
+ %fp = uitofp <8 x i16> %zext to <8 x half>
+ %cvt = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %fp)
+ ret <8 x i16> %cvt
+}
+
+define <8 x i16> @fcvtzu_uitofp_and_11bits(<8 x i16> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_and_11bits(
+; CHECK-NEXT: [[MASKED:%.*]] = and <8 x i16> [[A:%.*]], splat (i16 2047)
+; CHECK-NEXT: ret <8 x i16> [[MASKED]]
+;
+ %masked = and <8 x i16> %a, splat (i16 2047)
+ %fp = uitofp <8 x i16> %masked to <8 x half>
+ %cvt = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %fp)
+ ret <8 x i16> %cvt
+}
+
+define <4 x i32> @fcvtzs_sitofp_sext_i16(<4 x i16> %a) {
+; CHECK-LABEL: @fcvtzs_sitofp_sext_i16(
+; CHECK-NEXT: [[CVT:%.*]] = sext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[CVT]]
+;
+ %sext = sext <4 x i16> %a to <4 x i32>
+ %fp = sitofp <4 x i32> %sext to <4 x float>
+ %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %fp)
+ ret <4 x i32> %cvt
+}
+
+define <4 x i16> @fcvtzu_uitofp_and_255(<4 x i16> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_and_255(
+; CHECK-NEXT: [[MASKED:%.*]] = and <4 x i16> [[A:%.*]], splat (i16 255)
+; CHECK-NEXT: ret <4 x i16> [[MASKED]]
+;
+ %masked = and <4 x i16> %a, splat (i16 255)
+ %fp = uitofp <4 x i16> %masked to <4 x half>
+ %cvt = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> %fp)
+ ret <4 x i16> %cvt
+}
+
+define <4 x i32> @fcvtzu_uitofp_widen_i16_to_i32(<4 x i16> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_widen_i16_to_i32(
+; CHECK-NEXT: [[CVT:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[CVT]]
+;
+ %fp = uitofp <4 x i16> %a to <4 x float>
+ %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %fp)
+ ret <4 x i32> %cvt
+}
+
+define i32 @fcvtzu_uitofp_scalar_widen(i16 %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_scalar_widen(
+; CHECK-NEXT: [[CVT:%.*]] = zext i16 [[A:%.*]] to i32
+; CHECK-NEXT: ret i32 [[CVT]]
+;
+ %fp = uitofp i16 %a to float
+ %cvt = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %fp)
+ ret i32 %cvt
+}
+
+define i32 @fcvtzs_sitofp_scalar_widen(i16 %a) {
+; CHECK-LABEL: @fcvtzs_sitofp_scalar_widen(
+; CHECK-NEXT: [[CVT:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT: ret i32 [[CVT]]
+;
+ %fp = sitofp i16 %a to float
+ %cvt = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %fp)
+ ret i32 %cvt
+}
+
+define <2 x i64> @fcvtzs_sitofp_double_widen(<2 x i32> %a) {
+; CHECK-LABEL: @fcvtzs_sitofp_double_widen(
+; CHECK-NEXT: [[CVT:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> [[CVT]]
+;
+ %fp = sitofp <2 x i32> %a to <2 x double>
+ %cvt = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> %fp)
+ ret <2 x i64> %cvt
+}
+
+define <2 x i64> @fcvtzu_uitofp_double_widen(<2 x i32> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_double_widen(
+; CHECK-NEXT: [[CVT:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> [[CVT]]
+;
+ %fp = uitofp <2 x i32> %a to <2 x double>
+ %cvt = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> %fp)
+ ret <2 x i64> %cvt
+}
+
+; Negative: narrowing (i32 -> i16) is not yet supported, even though the masked value fits in i16.
+define <4 x i16> @neg_fcvtzu_uitofp_narrow_known_bits(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzu_uitofp_narrow_known_bits(
+; CHECK-NEXT: [[MASKED:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 255)
+; CHECK-NEXT: [[FP:%.*]] = uitofp nneg <4 x i32> [[MASKED]] to <4 x float>
+; CHECK-NEXT: [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT: ret <4 x i16> [[CVT]]
+;
+ %masked = and <4 x i32> %a, splat (i32 255)
+ %fp = uitofp <4 x i32> %masked to <4 x float>
+ %cvt = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f32(<4 x float> %fp)
+ ret <4 x i16> %cvt
+}
+
+; Negative: narrowing (i32 -> i16) is not yet supported; the sitofp of the masked value is canonicalized to uitofp nneg.
+define <4 x i16> @neg_fcvtzs_sitofp_narrow(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzs_sitofp_narrow(
+; CHECK-NEXT: [[MASKED:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 255)
+; CHECK-NEXT: [[FP:%.*]] = uitofp nneg <4 x i32> [[MASKED]] to <4 x float>
+; CHECK-NEXT: [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT: ret <4 x i16> [[CVT]]
+;
+ %masked = and <4 x i32> %a, splat (i32 255)
+ %fp = sitofp <4 x i32> %masked to <4 x float>
+ %cvt = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f32(<4 x float> %fp)
+ ret <4 x i16> %cvt
+}
+
+; Negative: a full i16 may need 16 bits but half has an 11-bit significand, so the uitofp is not known exact.
+define <8 x i16> @neg_fcvtzu_uitofp_full_i16(<8 x i16> %a) {
+; CHECK-LABEL: @neg_fcvtzu_uitofp_full_i16(
+; CHECK-NEXT: [[FP:%.*]] = uitofp <8 x i16> [[A:%.*]] to <8 x half>
+; CHECK-NEXT: [[CVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[FP]])
+; CHECK-NEXT: ret <8 x i16> [[CVT]]
+;
+ %fp = uitofp <8 x i16> %a to <8 x half>
+ %cvt = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %fp)
+ ret <8 x i16> %cvt
+}
+
+; Negative: the mask leaves 12 active bits but half has an 11-bit significand, so the uitofp is not known exact.
+define <8 x i16> @neg_fcvtzu_uitofp_12bits(<8 x i16> %a) {
+; CHECK-LABEL: @neg_fcvtzu_uitofp_12bits(
+; CHECK-NEXT: [[MASKED:%.*]] = and <8 x i16> [[A:%.*]], splat (i16 4095)
+; CHECK-NEXT: [[FP:%.*]] = uitofp nneg <8 x i16> [[MASKED]] to <8 x half>
+; CHECK-NEXT: [[CVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[FP]])
+; CHECK-NEXT: ret <8 x i16> [[CVT]]
+;
+ %masked = and <8 x i16> %a, splat (i16 4095)
+ %fp = uitofp <8 x i16> %masked to <8 x half>
+ %cvt = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %fp)
+ ret <8 x i16> %cvt
+}
+
+; Negative: cross-sign (fcvtzs fed by uitofp); only same-sign pairs are folded.
+define <4 x i32> @neg_fcvtzs_uitofp_cross_sign(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzs_uitofp_cross_sign(
+; CHECK-NEXT: [[FP:%.*]] = uitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT: [[CVT:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT: ret <4 x i32> [[CVT]]
+;
+ %fp = uitofp <4 x i32> %a to <4 x float>
+ %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %fp)
+ ret <4 x i32> %cvt
+}
+
+; Negative: cross-sign (fcvtzu fed by sitofp); only same-sign pairs are folded.
+define <4 x i32> @neg_fcvtzu_sitofp_cross_sign(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzu_sitofp_cross_sign(
+; CHECK-NEXT: [[FP:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT: [[CVT:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT: ret <4 x i32> [[CVT]]
+;
+ %fp = sitofp <4 x i32> %a to <4 x float>
+ %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %fp)
+ ret <4 x i32> %cvt
+}
+
+; Negative: 24 active bits are exact in float but may exceed 16 bits, so the conversion to i16 can saturate.
+define <4 x i16> @neg_fcvtzu_uitofp_narrow_no_fit(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzu_uitofp_narrow_no_fit(
+; CHECK-NEXT: [[MASKED:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 16777215)
+; CHECK-NEXT: [[FP:%.*]] = uitofp nneg <4 x i32> [[MASKED]] to <4 x float>
+; CHECK-NEXT: [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT: ret <4 x i16> [[CVT]]
+;
+ %masked = and <4 x i32> %a, splat (i32 16777215)
+ %fp = uitofp <4 x i32> %masked to <4 x float>
+ %cvt = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f32(<4 x float> %fp)
+ ret <4 x i16> %cvt
+}
+
+; Negative: not a uitofp/sitofp source.
+define <4 x i32> @neg_fcvtzu_no_cast(<4 x float> %fp) {
+; CHECK-LABEL: @neg_fcvtzu_no_cast(
+; CHECK-NEXT: [[CVT:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[FP:%.*]])
+; CHECK-NEXT: ret <4 x i32> [[CVT]]
+;
+ %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %fp)
+ ret <4 x i32> %cvt
+}
+
+; Negative: non-integer cast (fptrunc) feeding fcvtzu.
+define <4 x i32> @neg_fcvtzu_fptrunc(<4 x double> %a) {
+; CHECK-LABEL: @neg_fcvtzu_fptrunc(
+; CHECK-NEXT: [[TRUNC:%.*]] = fptrunc <4 x double> [[A:%.*]] to <4 x float>
+; CHECK-NEXT: [[CVT:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[TRUNC]])
+; CHECK-NEXT: ret <4 x i32> [[CVT]]
+;
+ %trunc = fptrunc <4 x double> %a to <4 x float>
+ %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %trunc)
+ ret <4 x i32> %cvt
+}
More information about the llvm-branch-commits
mailing list