[llvm] [DAGCombiner] Extend fp->int->fp optimizations to include clamping (PR #164502)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 14 03:24:32 PST 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/164502
>From 110a2d1c38047f1f82ebc7316d9fb7ad1c9b8945 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 21 Oct 2025 23:41:43 +0300
Subject: [PATCH 1/2] [DAGCombiner] Extend fp->int->fp optimizations to include
clamping
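
A rough sketch of the pattern this targets, with constants borrowed from the
AArch64 test added below (signed case, which only folds under
-enable-no-signed-zeros-fp-math):

  %i = fptosi float %x to i32
  %lo = call i32 @llvm.smax.i32(i32 %i, i32 -512)
  %hi = call i32 @llvm.smin.i32(i32 %lo, i32 1023)
  %f = sitofp i32 %hi to float

roughly becomes, when FTRUNC/FMINNUM/FMAXNUM are legal for the type:

  %t = call float @llvm.trunc.f32(float %x)
  %lo = call float @llvm.maxnum.f32(float %t, float -512.0)
  %f = call float @llvm.minnum.f32(float %lo, float 1023.0)

The integer clamp constants must round-trip exactly through the FP type for
the fold to apply.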
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 79 +++++++---
llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll | 140 ++++++++++++++++++
2 files changed, 201 insertions(+), 18 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 77346cb7bfac0..ad75d2cf5a8ac 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IntervalMap.h"
@@ -19040,6 +19041,8 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
const TargetLowering &TLI) {
// We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
+ // Additionally, if there are clamps ([us]min or [us]max) around
+ // the fpto[us]i, we can fold those into fminnum/fmaxnum around the ftrunc.
// If NoSignedZerosFPMath is enabled, this is a direct replacement.
// Otherwise, for strict math, we must handle edge cases:
// 1. For unsigned conversions, use FABS to handle negative cases. Take -0.0
@@ -19051,28 +19054,68 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
return SDValue();
- // fptosi/fptoui round towards zero, so converting from FP to integer and
- // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
- SDValue N0 = N->getOperand(0);
- if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
- N0.getOperand(0).getValueType() == VT) {
- if (DAG.getTarget().Options.NoSignedZerosFPMath)
- return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
- }
+ bool IsUnsigned = N->getOpcode() == ISD::UINT_TO_FP;
+ bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
+ assert(IsSigned || IsUnsigned);
- if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
- N0.getOperand(0).getValueType() == VT) {
- if (DAG.getTarget().Options.NoSignedZerosFPMath)
- return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+ bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath;
+ // For signed conversions: The optimization changes signed zero behavior.
+ if (IsSigned && !IsSignedZeroSafe)
+ return SDValue();
+ // For unsigned conversions, we need FABS to canonicalize -0.0 to +0.0
+ // (unless NoSignedZerosFPMath is set).
+ if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
+ return SDValue();
- // Strict math: use FABS to handle negative inputs correctly.
- if (TLI.isFAbsFree(VT)) {
- SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
- return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
- }
+ // Collect potential clamp operations (innermost to outermost) and peel.
+ struct ClampOp {
+ unsigned Opcode;
+ SDValue Constant;
+ };
+ SmallVector<ClampOp, 2> Clamps;
+ unsigned MinOp = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+ unsigned MaxOp = IsUnsigned ? ISD::UMAX : ISD::SMAX;
+ SDValue IntVal = N->getOperand(0);
+ constexpr unsigned MaxClampLevels = 2;
+ for (unsigned Level = 0; Level < MaxClampLevels; ++Level) {
+ if (!IntVal.hasOneUse() ||
+ (IntVal.getOpcode() != MinOp && IntVal.getOpcode() != MaxOp))
+ break;
+ unsigned FPClampOp =
+ (IntVal.getOpcode() == MinOp) ? ISD::FMINNUM : ISD::FMAXNUM;
+ if (!TLI.isOperationLegal(FPClampOp, VT))
+ return SDValue();
+ auto *IntConstNode = dyn_cast<ConstantSDNode>(IntVal.getOperand(1));
+ if (!IntConstNode)
+ return SDValue();
+ APFloat FPConst(VT.getFltSemantics());
+ APInt IntConst = IntConstNode->getAPIntValue();
+ FPConst.convertFromAPInt(IntConst, IsSigned, APFloat::rmNearestTiesToEven);
+ // Verify roundtrip exactness.
+ APSInt RoundTrip(IntConst.getBitWidth(), IsUnsigned);
+ bool IsExact;
+ if (FPConst.convertToInteger(RoundTrip, APFloat::rmTowardZero, &IsExact) !=
+ APFloat::opOK ||
+ !IsExact || static_cast<const APInt &>(RoundTrip) != IntConst)
+ return SDValue();
+ Clamps.push_back({FPClampOp, DAG.getConstantFP(FPConst, DL, VT)});
+ IntVal = IntVal.getOperand(0);
}
- return SDValue();
+ // Check that the sequence ends with a FPTo[us]i of the right type.
+ unsigned FPToIntOp = IsUnsigned ? ISD::FP_TO_UINT : ISD::FP_TO_SINT;
+ if (IntVal.getOpcode() != FPToIntOp ||
+ IntVal.getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ SDValue Result = IntVal.getOperand(0);
+ if (IsUnsigned && !IsSignedZeroSafe && TLI.isFAbsFree(VT))
+ Result = DAG.getNode(ISD::FABS, DL, VT, Result);
+ Result = DAG.getNode(ISD::FTRUNC, DL, VT, Result);
+ // Apply clamps, if any, in reverse order (innermost first).
+ for (auto I = Clamps.rbegin(), E = Clamps.rend(); I != E; ++I)
+ Result = DAG.getNode(I->Opcode, DL, VT, Result, I->Constant);
+ return Result;
}
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
diff --git a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
new file mode 100644
index 0000000000000..9a8c555953611
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefix=NO-SIGNED-ZEROS
+
+; Test folding of float->int->float roundtrips into float-only operations.
+; The optimization converts patterns like:
+; sitofp(fptosi(x)) -> ftrunc(x)
+; sitofp(smin(fptosi(x), C)) -> fminnum(ftrunc(x), (float)C)
+; This is relevant for AArch64 as it avoids GPR bouncing and keeps computation in SIMD/FP registers.
+
+define float @test_signed_basic(float %x) {
+; CHECK-LABEL: test_signed_basic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: scvtf s0, s0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi float %x to i32
+ %f = sitofp i32 %i to float
+ ret float %f
+}
+
+define float @test_unsigned_basic(float %x) {
+; CHECK-LABEL: test_unsigned_basic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ucvtf s0, s0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui float %x to i32
+ %f = uitofp i32 %i to float
+ ret float %f
+}
+
+define float @test_signed_min_max(float %x) {
+; CHECK-LABEL: test_signed_min_max:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w9, s0
+; CHECK-NEXT: mov w8, #-512 // =0xfffffe00
+; CHECK-NEXT: cmn w9, #512
+; CHECK-NEXT: csel w8, w9, w8, gt
+; CHECK-NEXT: mov w9, #1023 // =0x3ff
+; CHECK-NEXT: cmp w8, #1023
+; CHECK-NEXT: csel w8, w8, w9, lt
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #196, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi float %x to i32
+ %lower = call i32 @llvm.smax.i32(i32 %i, i32 -512)
+ %clamped = call i32 @llvm.smin.i32(i32 %lower, i32 1023)
+ %f = sitofp i32 %clamped to float
+ ret float %f
+}
+
+define float @test_unsigned_min_max(float %x) {
+; CHECK-LABEL: test_unsigned_min_max:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu w9, s0
+; CHECK-NEXT: mov w8, #512 // =0x200
+; CHECK-NEXT: cmp w9, #512
+; CHECK-NEXT: csel w8, w9, w8, hi
+; CHECK-NEXT: mov w9, #1023 // =0x3ff
+; CHECK-NEXT: cmp w8, #1023
+; CHECK-NEXT: csel w8, w8, w9, lo
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #68, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui float %x to i32
+ %lower = call i32 @llvm.umax.i32(i32 %i, i32 512)
+ %clamped = call i32 @llvm.umin.i32(i32 %lower, i32 1023)
+ %f = uitofp i32 %clamped to float
+ ret float %f
+}
+
+; 16777217 is NOT exactly representable in f32.
+define float @test_inexact_16777217(float %x) {
+; CHECK-LABEL: test_inexact_16777217:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: mov w9, #16777216 // =0x1000000
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: mov w9, #1 // =0x1
+; CHECK-NEXT: movk w9, #256, lsl #16
+; CHECK-NEXT: csel w8, w8, w9, le
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_inexact_16777217:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: fcvtzs w8, s0
+; NO-SIGNED-ZEROS-NEXT: mov w9, #16777216 // =0x1000000
+; NO-SIGNED-ZEROS-NEXT: cmp w8, w9
+; NO-SIGNED-ZEROS-NEXT: mov w9, #1 // =0x1
+; NO-SIGNED-ZEROS-NEXT: movk w9, #256, lsl #16
+; NO-SIGNED-ZEROS-NEXT: csel w8, w8, w9, le
+; NO-SIGNED-ZEROS-NEXT: scvtf s0, w8
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi float %x to i32
+ %clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
+ %f = sitofp i32 %clamped to float
+ ret float %f
+}
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
>From 561bded0d1d8662488594dbd191ff0135b45594c Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 21 Oct 2025 23:41:43 +0300
Subject: [PATCH 2/2] Address comments
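
Among other review fixes, clamp constants may now also be vector splats, so
the vector form folds too. A sketch mirroring the new test_signed_v4f32_min_max
below (again only under -enable-no-signed-zeros-fp-math for the signed case):

  %i = fptosi <4 x float> %x to <4 x i32>
  %lo = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %i, <4 x i32> splat (i32 -512))
  %hi = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %lo, <4 x i32> splat (i32 1023))
  %f = sitofp <4 x i32> %hi to <4 x float>

is expected to lower to a single frintz plus fmaxnm/fminnm on v4f32. The FP
min/max opcode is also chosen per target now: FMINNUM_IEEE/FMAXNUM_IEEE when
the operands are known not to be NaN and that form is legal, otherwise
FMINNUM/FMAXNUM.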
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 68 +++---
llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll | 210 ++++++++++++++----
llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll | 62 ++++++
3 files changed, 270 insertions(+), 70 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ad75d2cf5a8ac..f78b38aa4c57b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6579,12 +6579,25 @@ static bool arebothOperandsNotNan(SDValue Operand1, SDValue Operand2,
return DAG.isKnownNeverNaN(Operand2) && DAG.isKnownNeverNaN(Operand1);
}
+/// Returns an appropriate FP min/max opcode for clamping operations.
+static unsigned getMinMaxOpcodeForClamp(bool IsMin, SDValue Operand1,
+ SDValue Operand2, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ EVT VT = Operand1.getValueType();
+ unsigned IEEEOp = IsMin ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+ if (TLI.isOperationLegalOrCustom(IEEEOp, VT) &&
+ arebothOperandsNotNan(Operand1, Operand2, DAG))
+ return IEEEOp;
+ unsigned PreferredOp = IsMin ? ISD::FMINNUM : ISD::FMAXNUM;
+ if (TLI.isOperationLegalOrCustom(PreferredOp, VT))
+ return PreferredOp;
+ return ISD::DELETED_NODE;
+}
+
// FIXME: use FMINIMUMNUM if possible, such as for RISC-V.
-static unsigned getMinMaxOpcodeForFP(SDValue Operand1, SDValue Operand2,
- ISD::CondCode CC, unsigned OrAndOpcode,
- SelectionDAG &DAG,
- bool isFMAXNUMFMINNUM_IEEE,
- bool isFMAXNUMFMINNUM) {
+static unsigned getMinMaxOpcodeForCompareFold(
+ SDValue Operand1, SDValue Operand2, ISD::CondCode CC, unsigned OrAndOpcode,
+ SelectionDAG &DAG, bool isFMAXNUMFMINNUM_IEEE, bool isFMAXNUMFMINNUM) {
// The optimization cannot be applied for all the predicates because
// of the way FMINNUM/FMAXNUM and FMINNUM_IEEE/FMAXNUM_IEEE handle
// NaNs. For FMINNUM_IEEE/FMAXNUM_IEEE, the optimization cannot be
@@ -6742,9 +6755,9 @@ static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) {
else
NewOpcode = IsSigned ? ISD::SMAX : ISD::UMAX;
} else if (OpVT.isFloatingPoint())
- NewOpcode =
- getMinMaxOpcodeForFP(Operand1, Operand2, CC, LogicOp->getOpcode(),
- DAG, isFMAXNUMFMINNUM_IEEE, isFMAXNUMFMINNUM);
+ NewOpcode = getMinMaxOpcodeForCompareFold(
+ Operand1, Operand2, CC, LogicOp->getOpcode(), DAG,
+ isFMAXNUMFMINNUM_IEEE, isFMAXNUMFMINNUM);
if (NewOpcode != ISD::DELETED_NODE) {
SDValue MinMaxValue =
@@ -19067,29 +19080,28 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
return SDValue();
- // Collect potential clamp operations (innermost to outermost) and peel.
- struct ClampOp {
- unsigned Opcode;
+ // Collect potential clamp operations (outermost to innermost) and peel.
+ struct ClampInfo {
+ bool IsMin;
SDValue Constant;
};
- SmallVector<ClampOp, 2> Clamps;
+ constexpr unsigned MaxClamps = 2;
+ SmallVector<ClampInfo, MaxClamps> Clamps;
unsigned MinOp = IsUnsigned ? ISD::UMIN : ISD::SMIN;
unsigned MaxOp = IsUnsigned ? ISD::UMAX : ISD::SMAX;
SDValue IntVal = N->getOperand(0);
- constexpr unsigned MaxClampLevels = 2;
- for (unsigned Level = 0; Level < MaxClampLevels; ++Level) {
+ for (unsigned Level = 0; Level < MaxClamps; ++Level) {
if (!IntVal.hasOneUse() ||
(IntVal.getOpcode() != MinOp && IntVal.getOpcode() != MaxOp))
break;
- unsigned FPClampOp =
- (IntVal.getOpcode() == MinOp) ? ISD::FMINNUM : ISD::FMAXNUM;
- if (!TLI.isOperationLegal(FPClampOp, VT))
- return SDValue();
- auto *IntConstNode = dyn_cast<ConstantSDNode>(IntVal.getOperand(1));
- if (!IntConstNode)
+ SDValue RHS = IntVal.getOperand(1);
+ APInt IntConst;
+ if (auto *IntConstNode = dyn_cast<ConstantSDNode>(RHS))
+ IntConst = IntConstNode->getAPIntValue();
+ else if (!ISD::isConstantSplatVector(RHS.getNode(),
+ IntConst))
return SDValue();
APFloat FPConst(VT.getFltSemantics());
- APInt IntConst = IntConstNode->getAPIntValue();
FPConst.convertFromAPInt(IntConst, IsSigned, APFloat::rmNearestTiesToEven);
// Verify roundtrip exactness.
APSInt RoundTrip(IntConst.getBitWidth(), IsUnsigned);
@@ -19098,11 +19110,12 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
APFloat::opOK ||
!IsExact || static_cast<const APInt &>(RoundTrip) != IntConst)
return SDValue();
- Clamps.push_back({FPClampOp, DAG.getConstantFP(FPConst, DL, VT)});
+ bool IsMin = IntVal.getOpcode() == MinOp;
+ Clamps.push_back({IsMin, DAG.getConstantFP(FPConst, DL, VT)});
IntVal = IntVal.getOperand(0);
}
- // Check that the sequence ends with a FPTo[us]i of the right type.
+ // Check that the sequence ends with the correct kind of fpto[us]i.
unsigned FPToIntOp = IsUnsigned ? ISD::FP_TO_UINT : ISD::FP_TO_SINT;
if (IntVal.getOpcode() != FPToIntOp ||
IntVal.getOperand(0).getValueType() != VT)
@@ -19113,8 +19126,13 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
Result = DAG.getNode(ISD::FABS, DL, VT, Result);
Result = DAG.getNode(ISD::FTRUNC, DL, VT, Result);
// Apply clamps, if any, in reverse order (innermost first).
- for (auto I = Clamps.rbegin(), E = Clamps.rend(); I != E; ++I)
- Result = DAG.getNode(I->Opcode, DL, VT, Result, I->Constant);
+ for (const ClampInfo &Clamp : reverse(Clamps)) {
+ unsigned FPClampOp =
+ getMinMaxOpcodeForClamp(Clamp.IsMin, Result, Clamp.Constant, DAG, TLI);
+ if (FPClampOp == ISD::DELETED_NODE)
+ return SDValue();
+ Result = DAG.getNode(FPClampOp, DL, VT, Result, Clamp.Constant);
+ }
return Result;
}
diff --git a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
index 9a8c555953611..829782ceb9842 100644
--- a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefix=NO-SIGNED-ZEROS
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s --check-prefixes=CHECK,SIGNED-ZEROS
+; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefixes=CHECK,NO-SIGNED-ZEROS
; Test folding of float->int->float roundtrips into float-only operations.
; The optimization converts patterns like:
@@ -9,11 +9,11 @@
; This is relevant for AArch64 as it avoids GPR bouncing and keeps computation in SIMD/FP registers.
define float @test_signed_basic(float %x) {
-; CHECK-LABEL: test_signed_basic:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs s0, s0
-; CHECK-NEXT: scvtf s0, s0
-; CHECK-NEXT: ret
+; SIGNED-ZEROS-LABEL: test_signed_basic:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs s0, s0
+; SIGNED-ZEROS-NEXT: scvtf s0, s0
+; SIGNED-ZEROS-NEXT: ret
;
; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
; NO-SIGNED-ZEROS: // %bb.0: // %entry
@@ -26,11 +26,11 @@ entry:
}
define float @test_unsigned_basic(float %x) {
-; CHECK-LABEL: test_unsigned_basic:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu s0, s0
-; CHECK-NEXT: ucvtf s0, s0
-; CHECK-NEXT: ret
+; SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzu s0, s0
+; SIGNED-ZEROS-NEXT: ucvtf s0, s0
+; SIGNED-ZEROS-NEXT: ret
;
; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
; NO-SIGNED-ZEROS: // %bb.0: // %entry
@@ -43,17 +43,17 @@ entry:
}
define float @test_signed_min_max(float %x) {
-; CHECK-LABEL: test_signed_min_max:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs w9, s0
-; CHECK-NEXT: mov w8, #-512 // =0xfffffe00
-; CHECK-NEXT: cmn w9, #512
-; CHECK-NEXT: csel w8, w9, w8, gt
-; CHECK-NEXT: mov w9, #1023 // =0x3ff
-; CHECK-NEXT: cmp w8, #1023
-; CHECK-NEXT: csel w8, w8, w9, lt
-; CHECK-NEXT: scvtf s0, w8
-; CHECK-NEXT: ret
+; SIGNED-ZEROS-LABEL: test_signed_min_max:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs w9, s0
+; SIGNED-ZEROS-NEXT: mov w8, #-512 // =0xfffffe00
+; SIGNED-ZEROS-NEXT: cmn w9, #512
+; SIGNED-ZEROS-NEXT: csel w8, w9, w8, gt
+; SIGNED-ZEROS-NEXT: mov w9, #1023 // =0x3ff
+; SIGNED-ZEROS-NEXT: cmp w8, #1023
+; SIGNED-ZEROS-NEXT: csel w8, w8, w9, lt
+; SIGNED-ZEROS-NEXT: scvtf s0, w8
+; SIGNED-ZEROS-NEXT: ret
;
; NO-SIGNED-ZEROS-LABEL: test_signed_min_max:
; NO-SIGNED-ZEROS: // %bb.0: // %entry
@@ -74,17 +74,17 @@ entry:
}
define float @test_unsigned_min_max(float %x) {
-; CHECK-LABEL: test_unsigned_min_max:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu w9, s0
-; CHECK-NEXT: mov w8, #512 // =0x200
-; CHECK-NEXT: cmp w9, #512
-; CHECK-NEXT: csel w8, w9, w8, hi
-; CHECK-NEXT: mov w9, #1023 // =0x3ff
-; CHECK-NEXT: cmp w8, #1023
-; CHECK-NEXT: csel w8, w8, w9, lo
-; CHECK-NEXT: ucvtf s0, w8
-; CHECK-NEXT: ret
+; SIGNED-ZEROS-LABEL: test_unsigned_min_max:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzu w9, s0
+; SIGNED-ZEROS-NEXT: mov w8, #512 // =0x200
+; SIGNED-ZEROS-NEXT: cmp w9, #512
+; SIGNED-ZEROS-NEXT: csel w8, w9, w8, hi
+; SIGNED-ZEROS-NEXT: mov w9, #1023 // =0x3ff
+; SIGNED-ZEROS-NEXT: cmp w8, #1023
+; SIGNED-ZEROS-NEXT: csel w8, w8, w9, lo
+; SIGNED-ZEROS-NEXT: ucvtf s0, w8
+; SIGNED-ZEROS-NEXT: ret
;
; NO-SIGNED-ZEROS-LABEL: test_unsigned_min_max:
; NO-SIGNED-ZEROS: // %bb.0: // %entry
@@ -116,17 +116,6 @@ define float @test_inexact_16777217(float %x) {
; CHECK-NEXT: csel w8, w8, w9, le
; CHECK-NEXT: scvtf s0, w8
; CHECK-NEXT: ret
-;
-; NO-SIGNED-ZEROS-LABEL: test_inexact_16777217:
-; NO-SIGNED-ZEROS: // %bb.0: // %entry
-; NO-SIGNED-ZEROS-NEXT: fcvtzs w8, s0
-; NO-SIGNED-ZEROS-NEXT: mov w9, #16777216 // =0x1000000
-; NO-SIGNED-ZEROS-NEXT: cmp w8, w9
-; NO-SIGNED-ZEROS-NEXT: mov w9, #1 // =0x1
-; NO-SIGNED-ZEROS-NEXT: movk w9, #256, lsl #16
-; NO-SIGNED-ZEROS-NEXT: csel w8, w8, w9, le
-; NO-SIGNED-ZEROS-NEXT: scvtf s0, w8
-; NO-SIGNED-ZEROS-NEXT: ret
entry:
%i = fptosi float %x to i32
%clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
@@ -134,7 +123,138 @@ entry:
ret float %f
}
+define <4 x float> @test_signed_v4f32(<4 x float> %x) {
+; SIGNED-ZEROS-LABEL: test_signed_v4f32:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: scvtf v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_v4f32:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz v0.4s, v0.4s
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi <4 x float> %x to <4 x i32>
+ %f = sitofp <4 x i32> %i to <4 x float>
+ ret <4 x float> %f
+}
+
+define <4 x float> @test_unsigned_v4f32(<4 x float> %x) {
+; SIGNED-ZEROS-LABEL: test_unsigned_v4f32:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzu v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ucvtf v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_v4f32:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz v0.4s, v0.4s
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui <4 x float> %x to <4 x i32>
+ %f = uitofp <4 x i32> %i to <4 x float>
+ ret <4 x float> %f
+}
+
+define <2 x double> @test_signed_v2f64(<2 x double> %x) {
+; SIGNED-ZEROS-LABEL: test_signed_v2f64:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs v0.2d, v0.2d
+; SIGNED-ZEROS-NEXT: scvtf v0.2d, v0.2d
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_v2f64:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz v0.2d, v0.2d
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi <2 x double> %x to <2 x i64>
+ %f = sitofp <2 x i64> %i to <2 x double>
+ ret <2 x double> %f
+}
+
+define <2 x double> @test_unsigned_v2f64(<2 x double> %x) {
+; SIGNED-ZEROS-LABEL: test_unsigned_v2f64:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzu v0.2d, v0.2d
+; SIGNED-ZEROS-NEXT: ucvtf v0.2d, v0.2d
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_v2f64:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz v0.2d, v0.2d
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui <2 x double> %x to <2 x i64>
+ %f = uitofp <2 x i64> %i to <2 x double>
+ ret <2 x double> %f
+}
+
+define <4 x float> @test_signed_v4f32_min_max(<4 x float> %x) {
+; SIGNED-ZEROS-LABEL: test_signed_v4f32_min_max:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: mvni v1.4s, #1, msl #8
+; SIGNED-ZEROS-NEXT: smax v0.4s, v0.4s, v1.4s
+; SIGNED-ZEROS-NEXT: movi v1.4s, #3, msl #8
+; SIGNED-ZEROS-NEXT: smin v0.4s, v0.4s, v1.4s
+; SIGNED-ZEROS-NEXT: scvtf v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_v4f32_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.4s, #196, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz v0.4s, v0.4s
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
+; NO-SIGNED-ZEROS-NEXT: dup v1.4s, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm v0.4s, v0.4s, v1.4s
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi <4 x float> %x to <4 x i32>
+ %lower = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %i, <4 x i32> splat (i32 -512))
+ %clamped = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %lower, <4 x i32> splat (i32 1023))
+ %f = sitofp <4 x i32> %clamped to <4 x float>
+ ret <4 x float> %f
+}
+
+define <4 x float> @test_unsigned_v4f32_min_max(<4 x float> %x) {
+; SIGNED-ZEROS-LABEL: test_unsigned_v4f32_min_max:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: movi v1.4s, #2, lsl #8
+; SIGNED-ZEROS-NEXT: fcvtzu v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: umax v0.4s, v0.4s, v1.4s
+; SIGNED-ZEROS-NEXT: movi v1.4s, #3, msl #8
+; SIGNED-ZEROS-NEXT: umin v0.4s, v0.4s, v1.4s
+; SIGNED-ZEROS-NEXT: ucvtf v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_v4f32_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.4s, #68, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz v0.4s, v0.4s
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
+; NO-SIGNED-ZEROS-NEXT: dup v1.4s, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm v0.4s, v0.4s, v1.4s
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui <4 x float> %x to <4 x i32>
+ %lower = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %i, <4 x i32> splat (i32 512))
+ %clamped = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %lower, <4 x i32> splat (i32 1023))
+ %f = uitofp <4 x i32> %clamped to <4 x float>
+ ret <4 x float> %f
+}
+
+
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
+declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll
new file mode 100644
index 0000000000000..2416d6a852eb9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s --check-prefixes=CHECK,SIGNED-ZEROS
+; RUN: llc -mtriple=amdgcn --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefixes=CHECK,NO-SIGNED-ZEROS
+
+; Test folding of float->int->float roundtrips into float-only operations.
+
+define float @test_signed_basic(float %x) {
+; SIGNED-ZEROS-LABEL: test_signed_basic:
+; SIGNED-ZEROS: ; %bb.0: ; %entry
+; SIGNED-ZEROS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIGNED-ZEROS-NEXT: v_cvt_i32_f32_e32 v0, v0
+; SIGNED-ZEROS-NEXT: v_cvt_f32_i32_e32 v0, v0
+; SIGNED-ZEROS-NEXT: s_setpc_b64 s[30:31]
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
+; NO-SIGNED-ZEROS: ; %bb.0: ; %entry
+; NO-SIGNED-ZEROS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NO-SIGNED-ZEROS-NEXT: v_trunc_f32_e32 v0, v0
+; NO-SIGNED-ZEROS-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %i = fptosi float %x to i32
+ %f = sitofp i32 %i to float
+ ret float %f
+}
+
+; For unsigned conversions, even when signed zeros are possible, we can still
+; use truncate because fabs is free.
+define float @test_unsigned_basic(float %x) {
+; SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; SIGNED-ZEROS: ; %bb.0: ; %entry
+; SIGNED-ZEROS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIGNED-ZEROS-NEXT: v_trunc_f32_e64 v0, |v0|
+; SIGNED-ZEROS-NEXT: s_setpc_b64 s[30:31]
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; NO-SIGNED-ZEROS: ; %bb.0: ; %entry
+; NO-SIGNED-ZEROS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NO-SIGNED-ZEROS-NEXT: v_trunc_f32_e32 v0, v0
+; NO-SIGNED-ZEROS-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %i = fptoui float %x to i32
+ %f = uitofp i32 %i to float
+ ret float %f
+}
+
+; 16777217 is NOT exactly representable in f32.
+define float @test_inexact_16777217(float %x) {
+; CHECK-LABEL: test_inexact_16777217:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_i32_f32_e32 v0, v0
+; CHECK-NEXT: v_min_i32_e32 v0, 0x1000001, v0
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %i = fptosi float %x to i32
+ %clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
+ %f = sitofp i32 %clamped to float
+ ret float %f
+}
+
+declare i32 @llvm.smin.i32(i32, i32)