[llvm-branch-commits] [llvm] [AArch64] Fold fcvtzu/fcvtzs(uitofp/sitofp(x)) roundtrip (PR #190328)

Fri Apr 3 02:42:44 PDT 2026

https://github.com/SavchenkoValeriy created https://github.com/llvm/llvm-project/pull/190328

[AArch64] Fold fcvtzu/fcvtzs(uitofp/sitofp(x)) roundtrip

>From 2aa778f29f073b514fe596bed322823267cc041f Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Thu, 2 Apr 2026 22:42:34 +0100
Subject: [PATCH] [AArch64] Fold fcvtzu/fcvtzs(uitofp/sitofp(x)) roundtrip

---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  47 ++++
 .../AArch64/neon-fcvtz-roundtrip.ll           | 210 ++++++++++++++++++
 2 files changed, 257 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/neon-fcvtz-roundtrip.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 734339e5c7a05..7412a7470fa41 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2869,6 +2869,50 @@ static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
   return std::nullopt;
 }
 
+// Fold an exact int->FP->int round trip through NEON fcvtzu/fcvtzs:
+//   fcvtzu(uitofp(x)) -> x or zext(x)
+//   fcvtzs(sitofp(x)) -> x or sext(x)
+// Returns std::nullopt when the fold does not apply. The intrinsics saturate,
+// so unlike fptoui/fptosi (poison when out of range) we must prove:
+//   1. The int->FP cast is exact (no precision loss)
+//   2. The value fits in the destination integer type
+static std::optional<Instruction *> instCombineNeonFCvtz(InstCombiner &IC,
+                                                         IntrinsicInst &II) {
+  bool IsUnsigned = II.getIntrinsicID() == Intrinsic::aarch64_neon_fcvtzu;
+
+  // Only same-sign pairs (fcvtzu+uitofp, fcvtzs+sitofp) are handled for now.
+  CastInst *IToFP;
+  if (IsUnsigned)
+    IToFP = dyn_cast<UIToFPInst>(II.getArgOperand(0));
+  else
+    IToFP = dyn_cast<SIToFPInst>(II.getArgOperand(0));
+  if (!IToFP)
+    return std::nullopt;
+
+  // Bail out unless the int->FP conversion is known to be exact (no rounding).
+  if (!IC.isKnownExactCastIntToFP(*IToFP))
+    return std::nullopt;
+
+  Value *X = IToFP->getOperand(0);
+  Type *SrcIntTy = X->getType();
+  Type *DstIntTy = II.getType();
+  unsigned SrcBits = SrcIntTy->getScalarSizeInBits();
+  unsigned DstBits = DstIntTy->getScalarSizeInBits();
+
+  // A same-sign source no wider than the result always fits (no saturation).
+  // TODO: support narrowing with MaxActiveBits analysis.
+  if (SrcBits > DstBits)
+    return std::nullopt;
+
+  if (SrcBits == DstBits)
+    return IC.replaceInstUsesWith(II, X); // Same width: x is the result.
+
+  IC.Builder.SetInsertPoint(&II);
+  Value *Ext = IsUnsigned ? IC.Builder.CreateZExt(X, DstIntTy)
+                          : IC.Builder.CreateSExt(X, DstIntTy);
+  return IC.replaceInstUsesWith(II, Ext);
+}
+
 static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
                                                         IntrinsicInst &II) {
   Value *Pred = II.getOperand(0);
@@ -3095,6 +3139,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     return instCombineSVEUxt(IC, II, 32);
   case Intrinsic::aarch64_sme_in_streaming_mode:
     return instCombineInStreamingMode(IC, II);
+  case Intrinsic::aarch64_neon_fcvtzu:
+  case Intrinsic::aarch64_neon_fcvtzs:
+    return instCombineNeonFCvtz(IC, II);
   }
 
   return std::nullopt;
diff --git a/llvm/test/Transforms/InstCombine/AArch64/neon-fcvtz-roundtrip.ll b/llvm/test/Transforms/InstCombine/AArch64/neon-fcvtz-roundtrip.ll
new file mode 100644
index 0000000000000..c1397ea0fefb8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/neon-fcvtz-roundtrip.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine -mtriple aarch64 %s | FileCheck %s
+
+define <8 x i16> @fcvtzu_uitofp_zext_i8(<8 x i8> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_zext_i8(
+; CHECK-NEXT:    [[CVT:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[CVT]]
+;
+  %zext = zext <8 x i8> %a to <8 x i16>
+  %fp = uitofp <8 x i16> %zext to <8 x half>
+  %cvt = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %fp)
+  ret <8 x i16> %cvt
+}
+
+define <8 x i16> @fcvtzu_uitofp_and_11bits(<8 x i16> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_and_11bits(
+; CHECK-NEXT:    [[MASKED:%.*]] = and <8 x i16> [[A:%.*]], splat (i16 2047)
+; CHECK-NEXT:    ret <8 x i16> [[MASKED]]
+;
+  %masked = and <8 x i16> %a, splat (i16 2047)
+  %fp = uitofp <8 x i16> %masked to <8 x half>
+  %cvt = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %fp)
+  ret <8 x i16> %cvt
+}
+
+define <4 x i32> @fcvtzs_sitofp_sext_i16(<4 x i16> %a) {
+; CHECK-LABEL: @fcvtzs_sitofp_sext_i16(
+; CHECK-NEXT:    [[CVT:%.*]] = sext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[CVT]]
+;
+  %sext = sext <4 x i16> %a to <4 x i32>
+  %fp = sitofp <4 x i32> %sext to <4 x float>
+  %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %fp)
+  ret <4 x i32> %cvt
+}
+
+define <4 x i16> @fcvtzu_uitofp_and_255(<4 x i16> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_and_255(
+; CHECK-NEXT:    [[MASKED:%.*]] = and <4 x i16> [[A:%.*]], splat (i16 255)
+; CHECK-NEXT:    ret <4 x i16> [[MASKED]]
+;
+  %masked = and <4 x i16> %a, splat (i16 255)
+  %fp = uitofp <4 x i16> %masked to <4 x half>
+  %cvt = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> %fp)
+  ret <4 x i16> %cvt
+}
+
+define <4 x i32> @fcvtzu_uitofp_widen_i16_to_i32(<4 x i16> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_widen_i16_to_i32(
+; CHECK-NEXT:    [[CVT:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[CVT]]
+;
+  %fp = uitofp <4 x i16> %a to <4 x float>
+  %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %fp)
+  ret <4 x i32> %cvt
+}
+
+define i32 @fcvtzu_uitofp_scalar_widen(i16 %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_scalar_widen(
+; CHECK-NEXT:    [[CVT:%.*]] = zext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[CVT]]
+;
+  %fp = uitofp i16 %a to float
+  %cvt = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %fp)
+  ret i32 %cvt
+}
+
+define i32 @fcvtzs_sitofp_scalar_widen(i16 %a) {
+; CHECK-LABEL: @fcvtzs_sitofp_scalar_widen(
+; CHECK-NEXT:    [[CVT:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[CVT]]
+;
+  %fp = sitofp i16 %a to float
+  %cvt = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %fp)
+  ret i32 %cvt
+}
+
+define <2 x i64> @fcvtzs_sitofp_double_widen(<2 x i32> %a) {
+; CHECK-LABEL: @fcvtzs_sitofp_double_widen(
+; CHECK-NEXT:    [[CVT:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[CVT]]
+;
+  %fp = sitofp <2 x i32> %a to <2 x double>
+  %cvt = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> %fp)
+  ret <2 x i64> %cvt
+}
+
+define <2 x i64> @fcvtzu_uitofp_double_widen(<2 x i32> %a) {
+; CHECK-LABEL: @fcvtzu_uitofp_double_widen(
+; CHECK-NEXT:    [[CVT:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[CVT]]
+;
+  %fp = uitofp <2 x i32> %a to <2 x double>
+  %cvt = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> %fp)
+  ret <2 x i64> %cvt
+}
+
+; Negative: narrowing, SrcBits > DstBits not yet supported.
+define <4 x i16> @neg_fcvtzu_uitofp_narrow_known_bits(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzu_uitofp_narrow_known_bits(
+; CHECK-NEXT:    [[MASKED:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 255)
+; CHECK-NEXT:    [[FP:%.*]] = uitofp nneg <4 x i32> [[MASKED]] to <4 x float>
+; CHECK-NEXT:    [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT:    ret <4 x i16> [[CVT]]
+;
+  %masked = and <4 x i32> %a, splat (i32 255)
+  %fp = uitofp <4 x i32> %masked to <4 x float>
+  %cvt = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f32(<4 x float> %fp)
+  ret <4 x i16> %cvt
+}
+
+; Negative: narrowing, SrcBits > DstBits not yet supported (the sitofp of the masked value also becomes uitofp nneg).
+define <4 x i16> @neg_fcvtzs_sitofp_narrow(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzs_sitofp_narrow(
+; CHECK-NEXT:    [[MASKED:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 255)
+; CHECK-NEXT:    [[FP:%.*]] = uitofp nneg <4 x i32> [[MASKED]] to <4 x float>
+; CHECK-NEXT:    [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT:    ret <4 x i16> [[CVT]]
+;
+  %masked = and <4 x i32> %a, splat (i32 255)
+  %fp = sitofp <4 x i32> %masked to <4 x float>
+  %cvt = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f32(<4 x float> %fp)
+  ret <4 x i16> %cvt
+}
+
+; Negative: 16 > 11 (half mantissa), not exact.
+define <8 x i16> @neg_fcvtzu_uitofp_full_i16(<8 x i16> %a) {
+; CHECK-LABEL: @neg_fcvtzu_uitofp_full_i16(
+; CHECK-NEXT:    [[FP:%.*]] = uitofp <8 x i16> [[A:%.*]] to <8 x half>
+; CHECK-NEXT:    [[CVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[FP]])
+; CHECK-NEXT:    ret <8 x i16> [[CVT]]
+;
+  %fp = uitofp <8 x i16> %a to <8 x half>
+  %cvt = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %fp)
+  ret <8 x i16> %cvt
+}
+
+; Negative: 12 > 11 (half mantissa), not exact.
+define <8 x i16> @neg_fcvtzu_uitofp_12bits(<8 x i16> %a) {
+; CHECK-LABEL: @neg_fcvtzu_uitofp_12bits(
+; CHECK-NEXT:    [[MASKED:%.*]] = and <8 x i16> [[A:%.*]], splat (i16 4095)
+; CHECK-NEXT:    [[FP:%.*]] = uitofp nneg <8 x i16> [[MASKED]] to <8 x half>
+; CHECK-NEXT:    [[CVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[FP]])
+; CHECK-NEXT:    ret <8 x i16> [[CVT]]
+;
+  %masked = and <8 x i16> %a, splat (i16 4095)
+  %fp = uitofp <8 x i16> %masked to <8 x half>
+  %cvt = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %fp)
+  ret <8 x i16> %cvt
+}
+
+; Negative: cross-sign, fcvtzs with uitofp.
+define <4 x i32> @neg_fcvtzs_uitofp_cross_sign(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzs_uitofp_cross_sign(
+; CHECK-NEXT:    [[FP:%.*]] = uitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT:    [[CVT:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT:    ret <4 x i32> [[CVT]]
+;
+  %fp = uitofp <4 x i32> %a to <4 x float>
+  %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %fp)
+  ret <4 x i32> %cvt
+}
+
+; Negative: cross-sign, fcvtzu with sitofp.
+define <4 x i32> @neg_fcvtzu_sitofp_cross_sign(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzu_sitofp_cross_sign(
+; CHECK-NEXT:    [[FP:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT:    [[CVT:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT:    ret <4 x i32> [[CVT]]
+;
+  %fp = sitofp <4 x i32> %a to <4 x float>
+  %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %fp)
+  ret <4 x i32> %cvt
+}
+
+; Negative: 24 active bits exact in float, but 24 > 16 so doesn't fit in i16.
+define <4 x i16> @neg_fcvtzu_uitofp_narrow_no_fit(<4 x i32> %a) {
+; CHECK-LABEL: @neg_fcvtzu_uitofp_narrow_no_fit(
+; CHECK-NEXT:    [[MASKED:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 16777215)
+; CHECK-NEXT:    [[FP:%.*]] = uitofp nneg <4 x i32> [[MASKED]] to <4 x float>
+; CHECK-NEXT:    [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f32(<4 x float> [[FP]])
+; CHECK-NEXT:    ret <4 x i16> [[CVT]]
+;
+  %masked = and <4 x i32> %a, splat (i32 16777215)
+  %fp = uitofp <4 x i32> %masked to <4 x float>
+  %cvt = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f32(<4 x float> %fp)
+  ret <4 x i16> %cvt
+}
+
+; Negative: not a uitofp/sitofp source.
+define <4 x i32> @neg_fcvtzu_no_cast(<4 x float> %fp) {
+; CHECK-LABEL: @neg_fcvtzu_no_cast(
+; CHECK-NEXT:    [[CVT:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[FP:%.*]])
+; CHECK-NEXT:    ret <4 x i32> [[CVT]]
+;
+  %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %fp)
+  ret <4 x i32> %cvt
+}
+
+; Negative: non-integer cast (fptrunc) feeding fcvtzu.
+define <4 x i32> @neg_fcvtzu_fptrunc(<4 x double> %a) {
+; CHECK-LABEL: @neg_fcvtzu_fptrunc(
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <4 x double> [[A:%.*]] to <4 x float>
+; CHECK-NEXT:    [[CVT:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[TRUNC]])
+; CHECK-NEXT:    ret <4 x i32> [[CVT]]
+;
+  %trunc = fptrunc <4 x double> %a to <4 x float>
+  %cvt = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %trunc)
+  ret <4 x i32> %cvt
+}