[llvm] [DAGCombiner] Extend fp->int->fp optimizations to include clamping (PR #164502)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 03:24:57 PST 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/164502
>From 74d25d1f6508106125e8393ab146c29016a59a1a Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 21 Oct 2025 23:41:43 +0300
Subject: [PATCH] [DAGCombiner] Extend fp->int->fp optimizations to include
clamping
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 79 +++++++---
llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll | 140 ++++++++++++++++++
2 files changed, 201 insertions(+), 18 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c9513611e6dcb..a8c5f136061cf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IntervalMap.h"
@@ -18984,6 +18985,8 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
const TargetLowering &TLI) {
// We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
+ // Additionally, if there are clamps ([us]min or [us]max) around
+ // the fpto[us]i, we can fold those into fminnum/fmaxnum around the ftrunc.
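+ // For example:
+ //   sitofp (smin (smax (fptosi X, Lo), Hi))
+ //     --> fminnum (fmaxnum (ftrunc X, (fp)Lo), (fp)Hi)
+ // provided Lo and Hi convert exactly to the FP type.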
// If NoSignedZerosFPMath is enabled, this is a direct replacement.
// Otherwise, for strict math, we must handle edge cases:
// 1. For unsigned conversions, use FABS to handle negative cases. Take -0.0
@@ -18995,28 +18998,68 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
return SDValue();
- // fptosi/fptoui round towards zero, so converting from FP to integer and
- // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
- SDValue N0 = N->getOperand(0);
- if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
- N0.getOperand(0).getValueType() == VT) {
- if (DAG.getTarget().Options.NoSignedZerosFPMath)
- return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
- }
+ bool IsUnsigned = N->getOpcode() == ISD::UINT_TO_FP;
+ bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
+ assert(IsSigned || IsUnsigned);
- if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
- N0.getOperand(0).getValueType() == VT) {
- if (DAG.getTarget().Options.NoSignedZerosFPMath)
- return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+ bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath;
+ // For signed conversions, the optimization changes signed-zero behavior, so
+ // it is only safe when NoSignedZerosFPMath is set.
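+ // E.g. for X = -0.5 the integer roundtrip yields +0.0, but ftrunc(-0.5)
+ // is -0.0.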
+ if (IsSigned && !IsSignedZeroSafe)
+ return SDValue();
+ // For unsigned conversions, we need FABS to canonicalize -0.0 to +0.0
+ // (unless NoSignedZerosFPMath is set).
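+ // E.g. for X = -0.25 the integer roundtrip yields +0.0, while ftrunc(-0.25)
+ // is -0.0; taking FABS first restores the +0.0 result.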
+ if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
+ return SDValue();
- // Strict math: use FABS to handle negative inputs correctly.
- if (TLI.isFAbsFree(VT)) {
- SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
- return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
- }
+ // Peel off potential clamp operations, collecting them from outermost to
+ // innermost.
+ struct ClampOp {
+ unsigned Opcode;
+ SDValue Constant;
+ };
+ SmallVector<ClampOp, 2> Clamps;
+ unsigned MinOp = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+ unsigned MaxOp = IsUnsigned ? ISD::UMAX : ISD::SMAX;
+ SDValue IntVal = N->getOperand(0);
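+ // At most two clamp levels are peeled, enough for a min+max pair.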
+ constexpr unsigned MaxClampLevels = 2;
+ for (unsigned Level = 0; Level < MaxClampLevels; ++Level) {
+ if (!IntVal.hasOneUse() ||
+ (IntVal.getOpcode() != MinOp && IntVal.getOpcode() != MaxOp))
+ break;
+ unsigned FPClampOp =
+ (IntVal.getOpcode() == MinOp) ? ISD::FMINNUM : ISD::FMAXNUM;
+ if (!TLI.isOperationLegal(FPClampOp, VT))
+ return SDValue();
+ auto *IntConstNode = dyn_cast<ConstantSDNode>(IntVal.getOperand(1));
+ if (!IntConstNode)
+ return SDValue();
+ APFloat FPConst(VT.getFltSemantics());
+ APInt IntConst = IntConstNode->getAPIntValue();
+ FPConst.convertFromAPInt(IntConst, IsSigned, APFloat::rmNearestTiesToEven);
+ // Verify roundtrip exactness.
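+ // E.g. i32 16777217 rounds to 16777216.0 in f32, so clamping with the
+ // converted constant would not be equivalent.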
+ APSInt RoundTrip(IntConst.getBitWidth(), IsUnsigned);
+ bool IsExact;
+ if (FPConst.convertToInteger(RoundTrip, APFloat::rmTowardZero, &IsExact) !=
+ APFloat::opOK ||
+ !IsExact || static_cast<const APInt &>(RoundTrip) != IntConst)
+ return SDValue();
+ Clamps.push_back({FPClampOp, DAG.getConstantFP(FPConst, DL, VT)});
+ IntVal = IntVal.getOperand(0);
}
- return SDValue();
+ // Check that the sequence ends with an fpto[us]i whose source type is VT.
+ unsigned FPToIntOp = IsUnsigned ? ISD::FP_TO_UINT : ISD::FP_TO_SINT;
+ if (IntVal.getOpcode() != FPToIntOp ||
+ IntVal.getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ SDValue Result = IntVal.getOperand(0);
+ if (IsUnsigned && !IsSignedZeroSafe && TLI.isFAbsFree(VT))
+ Result = DAG.getNode(ISD::FABS, DL, VT, Result);
+ Result = DAG.getNode(ISD::FTRUNC, DL, VT, Result);
+ // Apply clamps, if any, in reverse order (innermost first).
+ for (auto I = Clamps.rbegin(), E = Clamps.rend(); I != E; ++I)
+ Result = DAG.getNode(I->Opcode, DL, VT, Result, I->Constant);
+ return Result;
}
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
diff --git a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
new file mode 100644
index 0000000000000..9a8c555953611
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefix=NO-SIGNED-ZEROS
+
+; Test folding of float->int->float roundtrips into float-only operations.
+; The optimization converts patterns like:
+; sitofp(fptosi(x)) -> ftrunc(x)
+; sitofp(smin(fptosi(x), C)) -> fminnum(ftrunc(x), (float)C)
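+; sitofp(smin(smax(fptosi(x), Lo), Hi)) -> fminnum(fmaxnum(ftrunc(x), (float)Lo), (float)Hi)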
+; This is relevant for AArch64 as it avoids GPR bouncing and keeps computation in SIMD/FP registers.
+
+define float @test_signed_basic(float %x) {
+; CHECK-LABEL: test_signed_basic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: scvtf s0, s0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi float %x to i32
+ %f = sitofp i32 %i to float
+ ret float %f
+}
+
+define float @test_unsigned_basic(float %x) {
+; CHECK-LABEL: test_unsigned_basic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ucvtf s0, s0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui float %x to i32
+ %f = uitofp i32 %i to float
+ ret float %f
+}
+
+define float @test_signed_min_max(float %x) {
+; CHECK-LABEL: test_signed_min_max:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w9, s0
+; CHECK-NEXT: mov w8, #-512 // =0xfffffe00
+; CHECK-NEXT: cmn w9, #512
+; CHECK-NEXT: csel w8, w9, w8, gt
+; CHECK-NEXT: mov w9, #1023 // =0x3ff
+; CHECK-NEXT: cmp w8, #1023
+; CHECK-NEXT: csel w8, w8, w9, lt
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #196, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi float %x to i32
+ %lower = call i32 @llvm.smax.i32(i32 %i, i32 -512)
+ %clamped = call i32 @llvm.smin.i32(i32 %lower, i32 1023)
+ %f = sitofp i32 %clamped to float
+ ret float %f
+}
+
+define float @test_unsigned_min_max(float %x) {
+; CHECK-LABEL: test_unsigned_min_max:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu w9, s0
+; CHECK-NEXT: mov w8, #512 // =0x200
+; CHECK-NEXT: cmp w9, #512
+; CHECK-NEXT: csel w8, w9, w8, hi
+; CHECK-NEXT: mov w9, #1023 // =0x3ff
+; CHECK-NEXT: cmp w8, #1023
+; CHECK-NEXT: csel w8, w8, w9, lo
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #68, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui float %x to i32
+ %lower = call i32 @llvm.umax.i32(i32 %i, i32 512)
+ %clamped = call i32 @llvm.umin.i32(i32 %lower, i32 1023)
+ %f = uitofp i32 %clamped to float
+ ret float %f
+}
+
+; 16777217 is NOT exactly representable in f32.
+define float @test_inexact_16777217(float %x) {
+; CHECK-LABEL: test_inexact_16777217:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: mov w9, #16777216 // =0x1000000
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: mov w9, #1 // =0x1
+; CHECK-NEXT: movk w9, #256, lsl #16
+; CHECK-NEXT: csel w8, w8, w9, le
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_inexact_16777217:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: fcvtzs w8, s0
+; NO-SIGNED-ZEROS-NEXT: mov w9, #16777216 // =0x1000000
+; NO-SIGNED-ZEROS-NEXT: cmp w8, w9
+; NO-SIGNED-ZEROS-NEXT: mov w9, #1 // =0x1
+; NO-SIGNED-ZEROS-NEXT: movk w9, #256, lsl #16
+; NO-SIGNED-ZEROS-NEXT: csel w8, w8, w9, le
+; NO-SIGNED-ZEROS-NEXT: scvtf s0, w8
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi float %x to i32
+ %clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
+ %f = sitofp i32 %clamped to float
+ ret float %f
+}
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)