[llvm] [DAGCombiner] Extend fp->int->fp optimizations to include clamping (PR #164502)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 14 03:24:32 PST 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/164502
>From 110a2d1c38047f1f82ebc7316d9fb7ad1c9b8945 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 21 Oct 2025 23:41:43 +0300
Subject: [PATCH 1/2] [DAGCombiner] Extend fp->int->fp optimizations to include
clamping
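
A rough sketch of the pattern this targets, with constants borrowed from the
AArch64 test added below (signed case, which only folds under
-enable-no-signed-zeros-fp-math):

  %i = fptosi float %x to i32
  %lo = call i32 @llvm.smax.i32(i32 %i, i32 -512)
  %hi = call i32 @llvm.smin.i32(i32 %lo, i32 1023)
  %f = sitofp i32 %hi to float

roughly becomes, when FTRUNC/FMINNUM/FMAXNUM are legal for the type:

  %t = call float @llvm.trunc.f32(float %x)
  %lo = call float @llvm.maxnum.f32(float %t, float -512.0)
  %f = call float @llvm.minnum.f32(float %lo, float 1023.0)

The integer clamp constants must round-trip exactly through the FP type for
the fold to apply.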
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 79 +++++++---
llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll | 140 ++++++++++++++++++
2 files changed, 201 insertions(+), 18 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 77346cb7bfac0..ad75d2cf5a8ac 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IntervalMap.h"
@@ -19040,6 +19041,8 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
const TargetLowering &TLI) {
// We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
+ // Additionally, if there are clamps ([us]min or [us]max) around
+ // the fpto[us]i, we can fold those into fminnum/fmaxnum around the ftrunc.
// If NoSignedZerosFPMath is enabled, this is a direct replacement.
// Otherwise, for strict math, we must handle edge cases:
// 1. For unsigned conversions, use FABS to handle negative cases. Take -0.0
@@ -19051,28 +19054,68 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
return SDValue();
- // fptosi/fptoui round towards zero, so converting from FP to integer and
- // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
- SDValue N0 = N->getOperand(0);
- if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
- N0.getOperand(0).getValueType() == VT) {
- if (DAG.getTarget().Options.NoSignedZerosFPMath)
- return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
- }
+ bool IsUnsigned = N->getOpcode() == ISD::UINT_TO_FP;
+ bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
+ assert(IsSigned || IsUnsigned);
- if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
- N0.getOperand(0).getValueType() == VT) {
- if (DAG.getTarget().Options.NoSignedZerosFPMath)
- return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+ bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath;
+ // For signed conversions: The optimization changes signed zero behavior.
+ if (IsSigned && !IsSignedZeroSafe)
+ return SDValue();
+ // For unsigned conversions, we need FABS to canonicalize -0.0 to +0.0
+ // (unless NoSignedZerosFPMath is set).
+ if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
+ return SDValue();
- // Strict math: use FABS to handle negative inputs correctly.
- if (TLI.isFAbsFree(VT)) {
- SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
- return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
- }
+ // Collect potential clamp operations (innermost to outermost) and peel.
+ struct ClampOp {
+ unsigned Opcode;
+ SDValue Constant;
+ };
+ SmallVector<ClampOp, 2> Clamps;
+ unsigned MinOp = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+ unsigned MaxOp = IsUnsigned ? ISD::UMAX : ISD::SMAX;
+ SDValue IntVal = N->getOperand(0);
+ constexpr unsigned MaxClampLevels = 2;
+ for (unsigned Level = 0; Level < MaxClampLevels; ++Level) {
+ if (!IntVal.hasOneUse() ||
+ (IntVal.getOpcode() != MinOp && IntVal.getOpcode() != MaxOp))
+ break;
+ unsigned FPClampOp =
+ (IntVal.getOpcode() == MinOp) ? ISD::FMINNUM : ISD::FMAXNUM;
+ if (!TLI.isOperationLegal(FPClampOp, VT))
+ return SDValue();
+ auto *IntConstNode = dyn_cast<ConstantSDNode>(IntVal.getOperand(1));
+ if (!IntConstNode)
+ return SDValue();
+ APFloat FPConst(VT.getFltSemantics());
+ APInt IntConst = IntConstNode->getAPIntValue();
+ FPConst.convertFromAPInt(IntConst, IsSigned, APFloat::rmNearestTiesToEven);
+ // Verify roundtrip exactness.
+ APSInt RoundTrip(IntConst.getBitWidth(), IsUnsigned);
+ bool IsExact;
+ if (FPConst.convertToInteger(RoundTrip, APFloat::rmTowardZero, &IsExact) !=
+ APFloat::opOK ||
+ !IsExact || static_cast<const APInt &>(RoundTrip) != IntConst)
+ return SDValue();
+ Clamps.push_back({FPClampOp, DAG.getConstantFP(FPConst, DL, VT)});
+ IntVal = IntVal.getOperand(0);
}
- return SDValue();
+ // Check that the sequence ends with a FPTo[us]i of the right type.
+ unsigned FPToIntOp = IsUnsigned ? ISD::FP_TO_UINT : ISD::FP_TO_SINT;
+ if (IntVal.getOpcode() != FPToIntOp ||
+ IntVal.getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ SDValue Result = IntVal.getOperand(0);
+ if (IsUnsigned && !IsSignedZeroSafe && TLI.isFAbsFree(VT))
+ Result = DAG.getNode(ISD::FABS, DL, VT, Result);
+ Result = DAG.getNode(ISD::FTRUNC, DL, VT, Result);
+ // Apply clamps, if any, in reverse order (innermost first).
+ for (auto I = Clamps.rbegin(), E = Clamps.rend(); I != E; ++I)
+ Result = DAG.getNode(I->Opcode, DL, VT, Result, I->Constant);
+ return Result;
}
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
diff --git a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
new file mode 100644
index 0000000000000..9a8c555953611
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefix=NO-SIGNED-ZEROS
+
+; Test folding of float->int->float roundtrips into float-only operations.
+; The optimization converts patterns like:
+; sitofp(fptosi(x)) -> ftrunc(x)
+; sitofp(smin(fptosi(x), C)) -> fminnum(ftrunc(x), (float)C)
+; This is relevant for AArch64 as it avoids GPR bouncing and keeps computation in SIMD/FP registers.
+
+define float @test_signed_basic(float %x) {
+; CHECK-LABEL: test_signed_basic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: scvtf s0, s0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi float %x to i32
+ %f = sitofp i32 %i to float
+ ret float %f
+}
+
+define float @test_unsigned_basic(float %x) {
+; CHECK-LABEL: test_unsigned_basic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ucvtf s0, s0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui float %x to i32
+ %f = uitofp i32 %i to float
+ ret float %f
+}
+
+define float @test_signed_min_max(float %x) {
+; CHECK-LABEL: test_signed_min_max:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w9, s0
+; CHECK-NEXT: mov w8, #-512 // =0xfffffe00
+; CHECK-NEXT: cmn w9, #512
+; CHECK-NEXT: csel w8, w9, w8, gt
+; CHECK-NEXT: mov w9, #1023 // =0x3ff
+; CHECK-NEXT: cmp w8, #1023
+; CHECK-NEXT: csel w8, w8, w9, lt
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #196, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi float %x to i32
+ %lower = call i32 @llvm.smax.i32(i32 %i, i32 -512)
+ %clamped = call i32 @llvm.smin.i32(i32 %lower, i32 1023)
+ %f = sitofp i32 %clamped to float
+ ret float %f
+}
+
+define float @test_unsigned_min_max(float %x) {
+; CHECK-LABEL: test_unsigned_min_max:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu w9, s0
+; CHECK-NEXT: mov w8, #512 // =0x200
+; CHECK-NEXT: cmp w9, #512
+; CHECK-NEXT: csel w8, w9, w8, hi
+; CHECK-NEXT: mov w9, #1023 // =0x3ff
+; CHECK-NEXT: cmp w8, #1023
+; CHECK-NEXT: csel w8, w8, w9, lo
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #68, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui float %x to i32
+ %lower = call i32 @llvm.umax.i32(i32 %i, i32 512)
+ %clamped = call i32 @llvm.umin.i32(i32 %lower, i32 1023)
+ %f = uitofp i32 %clamped to float
+ ret float %f
+}
+
+; 16777217 is NOT exactly representable in f32.
+define float @test_inexact_16777217(float %x) {
+; CHECK-LABEL: test_inexact_16777217:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: mov w9, #16777216 // =0x1000000
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: mov w9, #1 // =0x1
+; CHECK-NEXT: movk w9, #256, lsl #16
+; CHECK-NEXT: csel w8, w8, w9, le
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_inexact_16777217:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: fcvtzs w8, s0
+; NO-SIGNED-ZEROS-NEXT: mov w9, #16777216 // =0x1000000
+; NO-SIGNED-ZEROS-NEXT: cmp w8, w9
+; NO-SIGNED-ZEROS-NEXT: mov w9, #1 // =0x1
+; NO-SIGNED-ZEROS-NEXT: movk w9, #256, lsl #16
+; NO-SIGNED-ZEROS-NEXT: csel w8, w8, w9, le
+; NO-SIGNED-ZEROS-NEXT: scvtf s0, w8
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi float %x to i32
+ %clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
+ %f = sitofp i32 %clamped to float
+ ret float %f
+}
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
>From 561bded0d1d8662488594dbd191ff0135b45594c Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 21 Oct 2025 23:41:43 +0300
Subject: [PATCH 2/2] Address comments
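
Among other review fixes, clamp constants may now also be vector splats, so
the vector form folds too. A sketch mirroring the new test_signed_v4f32_min_max
below (again only under -enable-no-signed-zeros-fp-math for the signed case):

  %i = fptosi <4 x float> %x to <4 x i32>
  %lo = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %i, <4 x i32> splat (i32 -512))
  %hi = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %lo, <4 x i32> splat (i32 1023))
  %f = sitofp <4 x i32> %hi to <4 x float>

is expected to lower to a single frintz plus fmaxnm/fminnm on v4f32. The FP
min/max opcode is also chosen per target now: FMINNUM_IEEE/FMAXNUM_IEEE when
the operands are known not to be NaN and that form is legal, otherwise
FMINNUM/FMAXNUM.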
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 68 +++---
llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll | 210 ++++++++++++++----
llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll | 62 ++++++
3 files changed, 270 insertions(+), 70 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ad75d2cf5a8ac..f78b38aa4c57b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6579,12 +6579,25 @@ static bool arebothOperandsNotNan(SDValue Operand1, SDValue Operand2,
return DAG.isKnownNeverNaN(Operand2) && DAG.isKnownNeverNaN(Operand1);
}
+/// Returns an appropriate FP min/max opcode for clamping operations.
+static unsigned getMinMaxOpcodeForClamp(bool IsMin, SDValue Operand1,
+ SDValue Operand2, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ EVT VT = Operand1.getValueType();
+ unsigned IEEEOp = IsMin ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+ if (TLI.isOperationLegalOrCustom(IEEEOp, VT) &&
+ arebothOperandsNotNan(Operand1, Operand2, DAG))
+ return IEEEOp;
+ unsigned PreferredOp = IsMin ? ISD::FMINNUM : ISD::FMAXNUM;
+ if (TLI.isOperationLegalOrCustom(PreferredOp, VT))
+ return PreferredOp;
+ return ISD::DELETED_NODE;
+}
+
// FIXME: use FMINIMUMNUM if possible, such as for RISC-V.
-static unsigned getMinMaxOpcodeForFP(SDValue Operand1, SDValue Operand2,
- ISD::CondCode CC, unsigned OrAndOpcode,
- SelectionDAG &DAG,
- bool isFMAXNUMFMINNUM_IEEE,
- bool isFMAXNUMFMINNUM) {
+static unsigned getMinMaxOpcodeForCompareFold(
+ SDValue Operand1, SDValue Operand2, ISD::CondCode CC, unsigned OrAndOpcode,
+ SelectionDAG &DAG, bool isFMAXNUMFMINNUM_IEEE, bool isFMAXNUMFMINNUM) {
// The optimization cannot be applied for all the predicates because
// of the way FMINNUM/FMAXNUM and FMINNUM_IEEE/FMAXNUM_IEEE handle
// NaNs. For FMINNUM_IEEE/FMAXNUM_IEEE, the optimization cannot be
@@ -6742,9 +6755,9 @@ static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) {
else
NewOpcode = IsSigned ? ISD::SMAX : ISD::UMAX;
} else if (OpVT.isFloatingPoint())
- NewOpcode =
- getMinMaxOpcodeForFP(Operand1, Operand2, CC, LogicOp->getOpcode(),
- DAG, isFMAXNUMFMINNUM_IEEE, isFMAXNUMFMINNUM);
+ NewOpcode = getMinMaxOpcodeForCompareFold(
+ Operand1, Operand2, CC, LogicOp->getOpcode(), DAG,
+ isFMAXNUMFMINNUM_IEEE, isFMAXNUMFMINNUM);
if (NewOpcode != ISD::DELETED_NODE) {
SDValue MinMaxValue =
@@ -19067,29 +19080,28 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
return SDValue();
- // Collect potential clamp operations (innermost to outermost) and peel.
- struct ClampOp {
- unsigned Opcode;
+ // Collect potential clamp operations (outermost to innermost) and peel.
+ struct ClampInfo {
+ bool IsMin;
SDValue Constant;
};
- SmallVector<ClampOp, 2> Clamps;
+ constexpr unsigned MaxClamps = 2;
+ SmallVector<ClampInfo, MaxClamps> Clamps;
unsigned MinOp = IsUnsigned ? ISD::UMIN : ISD::SMIN;
unsigned MaxOp = IsUnsigned ? ISD::UMAX : ISD::SMAX;
SDValue IntVal = N->getOperand(0);
- constexpr unsigned MaxClampLevels = 2;
- for (unsigned Level = 0; Level < MaxClampLevels; ++Level) {
+ for (unsigned Level = 0; Level < MaxClamps; ++Level) {
if (!IntVal.hasOneUse() ||
(IntVal.getOpcode() != MinOp && IntVal.getOpcode() != MaxOp))
break;
- unsigned FPClampOp =
- (IntVal.getOpcode() == MinOp) ? ISD::FMINNUM : ISD::FMAXNUM;
- if (!TLI.isOperationLegal(FPClampOp, VT))
- return SDValue();
- auto *IntConstNode = dyn_cast<ConstantSDNode>(IntVal.getOperand(1));
- if (!IntConstNode)
+ SDValue RHS = IntVal.getOperand(1);
+ APInt IntConst;
+ if (auto *IntConstNode = dyn_cast<ConstantSDNode>(RHS))
+ IntConst = IntConstNode->getAPIntValue();
+ else if (!ISD::isConstantSplatVector(RHS.getNode(),
+ IntConst))
return SDValue();
APFloat FPConst(VT.getFltSemantics());
- APInt IntConst = IntConstNode->getAPIntValue();
FPConst.convertFromAPInt(IntConst, IsSigned, APFloat::rmNearestTiesToEven);
// Verify roundtrip exactness.
APSInt RoundTrip(IntConst.getBitWidth(), IsUnsigned);
@@ -19098,11 +19110,12 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
APFloat::opOK ||
!IsExact || static_cast<const APInt &>(RoundTrip) != IntConst)
return SDValue();
- Clamps.push_back({FPClampOp, DAG.getConstantFP(FPConst, DL, VT)});
+ bool IsMin = IntVal.getOpcode() == MinOp;
+ Clamps.push_back({IsMin, DAG.getConstantFP(FPConst, DL, VT)});
IntVal = IntVal.getOperand(0);
}
- // Check that the sequence ends with a FPTo[us]i of the right type.
+ // Check that the sequence ends with the correct kind of fpto[us]i.
unsigned FPToIntOp = IsUnsigned ? ISD::FP_TO_UINT : ISD::FP_TO_SINT;
if (IntVal.getOpcode() != FPToIntOp ||
IntVal.getOperand(0).getValueType() != VT)
@@ -19113,8 +19126,13 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
Result = DAG.getNode(ISD::FABS, DL, VT, Result);
Result = DAG.getNode(ISD::FTRUNC, DL, VT, Result);
// Apply clamps, if any, in reverse order (innermost first).
- for (auto I = Clamps.rbegin(), E = Clamps.rend(); I != E; ++I)
- Result = DAG.getNode(I->Opcode, DL, VT, Result, I->Constant);
+ for (const ClampInfo &Clamp : reverse(Clamps)) {
+ unsigned FPClampOp =
+ getMinMaxOpcodeForClamp(Clamp.IsMin, Result, Clamp.Constant, DAG, TLI);
+ if (FPClampOp == ISD::DELETED_NODE)
+ return SDValue();
+ Result = DAG.getNode(FPClampOp, DL, VT, Result, Clamp.Constant);
+ }
return Result;
}
diff --git a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
index 9a8c555953611..829782ceb9842 100644
--- a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefix=NO-SIGNED-ZEROS
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s --check-prefixes=CHECK,SIGNED-ZEROS
+; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefixes=CHECK,NO-SIGNED-ZEROS
; Test folding of float->int->float roundtrips into float-only operations.
; The optimization converts patterns like:
@@ -9,11 +9,11 @@
; This is relevant for AArch64 as it avoids GPR bouncing and keeps computation in SIMD/FP registers.
define float @test_signed_basic(float %x) {
-; CHECK-LABEL: test_signed_basic:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs s0, s0
-; CHECK-NEXT: scvtf s0, s0
-; CHECK-NEXT: ret
+; SIGNED-ZEROS-LABEL: test_signed_basic:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs s0, s0
+; SIGNED-ZEROS-NEXT: scvtf s0, s0
+; SIGNED-ZEROS-NEXT: ret
;
; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
; NO-SIGNED-ZEROS: // %bb.0: // %entry
@@ -26,11 +26,11 @@ entry:
}
define float @test_unsigned_basic(float %x) {
-; CHECK-LABEL: test_unsigned_basic:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu s0, s0
-; CHECK-NEXT: ucvtf s0, s0
-; CHECK-NEXT: ret
+; SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzu s0, s0
+; SIGNED-ZEROS-NEXT: ucvtf s0, s0
+; SIGNED-ZEROS-NEXT: ret
;
; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
; NO-SIGNED-ZEROS: // %bb.0: // %entry
@@ -43,17 +43,17 @@ entry:
}
define float @test_signed_min_max(float %x) {
-; CHECK-LABEL: test_signed_min_max:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs w9, s0
-; CHECK-NEXT: mov w8, #-512 // =0xfffffe00
-; CHECK-NEXT: cmn w9, #512
-; CHECK-NEXT: csel w8, w9, w8, gt
-; CHECK-NEXT: mov w9, #1023 // =0x3ff
-; CHECK-NEXT: cmp w8, #1023
-; CHECK-NEXT: csel w8, w8, w9, lt
-; CHECK-NEXT: scvtf s0, w8
-; CHECK-NEXT: ret
+; SIGNED-ZEROS-LABEL: test_signed_min_max:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs w9, s0
+; SIGNED-ZEROS-NEXT: mov w8, #-512 // =0xfffffe00
+; SIGNED-ZEROS-NEXT: cmn w9, #512
+; SIGNED-ZEROS-NEXT: csel w8, w9, w8, gt
+; SIGNED-ZEROS-NEXT: mov w9, #1023 // =0x3ff
+; SIGNED-ZEROS-NEXT: cmp w8, #1023
+; SIGNED-ZEROS-NEXT: csel w8, w8, w9, lt
+; SIGNED-ZEROS-NEXT: scvtf s0, w8
+; SIGNED-ZEROS-NEXT: ret
;
; NO-SIGNED-ZEROS-LABEL: test_signed_min_max:
; NO-SIGNED-ZEROS: // %bb.0: // %entry
@@ -74,17 +74,17 @@ entry:
}
define float @test_unsigned_min_max(float %x) {
-; CHECK-LABEL: test_unsigned_min_max:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu w9, s0
-; CHECK-NEXT: mov w8, #512 // =0x200
-; CHECK-NEXT: cmp w9, #512
-; CHECK-NEXT: csel w8, w9, w8, hi
-; CHECK-NEXT: mov w9, #1023 // =0x3ff
-; CHECK-NEXT: cmp w8, #1023
-; CHECK-NEXT: csel w8, w8, w9, lo
-; CHECK-NEXT: ucvtf s0, w8
-; CHECK-NEXT: ret
+; SIGNED-ZEROS-LABEL: test_unsigned_min_max:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzu w9, s0
+; SIGNED-ZEROS-NEXT: mov w8, #512 // =0x200
+; SIGNED-ZEROS-NEXT: cmp w9, #512
+; SIGNED-ZEROS-NEXT: csel w8, w9, w8, hi
+; SIGNED-ZEROS-NEXT: mov w9, #1023 // =0x3ff
+; SIGNED-ZEROS-NEXT: cmp w8, #1023
+; SIGNED-ZEROS-NEXT: csel w8, w8, w9, lo
+; SIGNED-ZEROS-NEXT: ucvtf s0, w8
+; SIGNED-ZEROS-NEXT: ret
;
; NO-SIGNED-ZEROS-LABEL: test_unsigned_min_max:
; NO-SIGNED-ZEROS: // %bb.0: // %entry
@@ -116,17 +116,6 @@ define float @test_inexact_16777217(float %x) {
; CHECK-NEXT: csel w8, w8, w9, le
; CHECK-NEXT: scvtf s0, w8
; CHECK-NEXT: ret
-;
-; NO-SIGNED-ZEROS-LABEL: test_inexact_16777217:
-; NO-SIGNED-ZEROS: // %bb.0: // %entry
-; NO-SIGNED-ZEROS-NEXT: fcvtzs w8, s0
-; NO-SIGNED-ZEROS-NEXT: mov w9, #16777216 // =0x1000000
-; NO-SIGNED-ZEROS-NEXT: cmp w8, w9
-; NO-SIGNED-ZEROS-NEXT: mov w9, #1 // =0x1
-; NO-SIGNED-ZEROS-NEXT: movk w9, #256, lsl #16
-; NO-SIGNED-ZEROS-NEXT: csel w8, w8, w9, le
-; NO-SIGNED-ZEROS-NEXT: scvtf s0, w8
-; NO-SIGNED-ZEROS-NEXT: ret
entry:
%i = fptosi float %x to i32
%clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
@@ -134,7 +123,138 @@ entry:
ret float %f
}
+define <4 x float> @test_signed_v4f32(<4 x float> %x) {
+; SIGNED-ZEROS-LABEL: test_signed_v4f32:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: scvtf v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_v4f32:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz v0.4s, v0.4s
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi <4 x float> %x to <4 x i32>
+ %f = sitofp <4 x i32> %i to <4 x float>
+ ret <4 x float> %f
+}
+
+define <4 x float> @test_unsigned_v4f32(<4 x float> %x) {
+; SIGNED-ZEROS-LABEL: test_unsigned_v4f32:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzu v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ucvtf v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_v4f32:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz v0.4s, v0.4s
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui <4 x float> %x to <4 x i32>
+ %f = uitofp <4 x i32> %i to <4 x float>
+ ret <4 x float> %f
+}
+
+define <2 x double> @test_signed_v2f64(<2 x double> %x) {
+; SIGNED-ZEROS-LABEL: test_signed_v2f64:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs v0.2d, v0.2d
+; SIGNED-ZEROS-NEXT: scvtf v0.2d, v0.2d
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_v2f64:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz v0.2d, v0.2d
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi <2 x double> %x to <2 x i64>
+ %f = sitofp <2 x i64> %i to <2 x double>
+ ret <2 x double> %f
+}
+
+define <2 x double> @test_unsigned_v2f64(<2 x double> %x) {
+; SIGNED-ZEROS-LABEL: test_unsigned_v2f64:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzu v0.2d, v0.2d
+; SIGNED-ZEROS-NEXT: ucvtf v0.2d, v0.2d
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_v2f64:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: frintz v0.2d, v0.2d
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui <2 x double> %x to <2 x i64>
+ %f = uitofp <2 x i64> %i to <2 x double>
+ ret <2 x double> %f
+}
+
+define <4 x float> @test_signed_v4f32_min_max(<4 x float> %x) {
+; SIGNED-ZEROS-LABEL: test_signed_v4f32_min_max:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: fcvtzs v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: mvni v1.4s, #1, msl #8
+; SIGNED-ZEROS-NEXT: smax v0.4s, v0.4s, v1.4s
+; SIGNED-ZEROS-NEXT: movi v1.4s, #3, msl #8
+; SIGNED-ZEROS-NEXT: smin v0.4s, v0.4s, v1.4s
+; SIGNED-ZEROS-NEXT: scvtf v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_v4f32_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.4s, #196, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz v0.4s, v0.4s
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
+; NO-SIGNED-ZEROS-NEXT: dup v1.4s, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm v0.4s, v0.4s, v1.4s
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptosi <4 x float> %x to <4 x i32>
+ %lower = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %i, <4 x i32> splat (i32 -512))
+ %clamped = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %lower, <4 x i32> splat (i32 1023))
+ %f = sitofp <4 x i32> %clamped to <4 x float>
+ ret <4 x float> %f
+}
+
+define <4 x float> @test_unsigned_v4f32_min_max(<4 x float> %x) {
+; SIGNED-ZEROS-LABEL: test_unsigned_v4f32_min_max:
+; SIGNED-ZEROS: // %bb.0: // %entry
+; SIGNED-ZEROS-NEXT: movi v1.4s, #2, lsl #8
+; SIGNED-ZEROS-NEXT: fcvtzu v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: umax v0.4s, v0.4s, v1.4s
+; SIGNED-ZEROS-NEXT: movi v1.4s, #3, msl #8
+; SIGNED-ZEROS-NEXT: umin v0.4s, v0.4s, v1.4s
+; SIGNED-ZEROS-NEXT: ucvtf v0.4s, v0.4s
+; SIGNED-ZEROS-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_v4f32_min_max:
+; NO-SIGNED-ZEROS: // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT: movi v1.4s, #68, lsl #24
+; NO-SIGNED-ZEROS-NEXT: frintz v0.4s, v0.4s
+; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
+; NO-SIGNED-ZEROS-NEXT: dup v1.4s, w8
+; NO-SIGNED-ZEROS-NEXT: fminnm v0.4s, v0.4s, v1.4s
+; NO-SIGNED-ZEROS-NEXT: ret
+entry:
+ %i = fptoui <4 x float> %x to <4 x i32>
+ %lower = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %i, <4 x i32> splat (i32 512))
+ %clamped = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %lower, <4 x i32> splat (i32 1023))
+ %f = uitofp <4 x i32> %clamped to <4 x float>
+ ret <4 x float> %f
+}
+
+
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
+declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll
new file mode 100644
index 0000000000000..2416d6a852eb9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-to-int-to-fp.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s --check-prefixes=CHECK,SIGNED-ZEROS
+; RUN: llc -mtriple=amdgcn --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefixes=CHECK,NO-SIGNED-ZEROS
+
+; Test folding of float->int->float roundtrips into float-only operations.
+
+define float @test_signed_basic(float %x) {
+; SIGNED-ZEROS-LABEL: test_signed_basic:
+; SIGNED-ZEROS: ; %bb.0: ; %entry
+; SIGNED-ZEROS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIGNED-ZEROS-NEXT: v_cvt_i32_f32_e32 v0, v0
+; SIGNED-ZEROS-NEXT: v_cvt_f32_i32_e32 v0, v0
+; SIGNED-ZEROS-NEXT: s_setpc_b64 s[30:31]
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
+; NO-SIGNED-ZEROS: ; %bb.0: ; %entry
+; NO-SIGNED-ZEROS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NO-SIGNED-ZEROS-NEXT: v_trunc_f32_e32 v0, v0
+; NO-SIGNED-ZEROS-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %i = fptosi float %x to i32
+ %f = sitofp i32 %i to float
+ ret float %f
+}
+
+; For unsigned conversions, even when signed zeros are possible, we can still
+; use truncate because fabs is free.
+define float @test_unsigned_basic(float %x) {
+; SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; SIGNED-ZEROS: ; %bb.0: ; %entry
+; SIGNED-ZEROS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIGNED-ZEROS-NEXT: v_trunc_f32_e64 v0, |v0|
+; SIGNED-ZEROS-NEXT: s_setpc_b64 s[30:31]
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; NO-SIGNED-ZEROS: ; %bb.0: ; %entry
+; NO-SIGNED-ZEROS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NO-SIGNED-ZEROS-NEXT: v_trunc_f32_e32 v0, v0
+; NO-SIGNED-ZEROS-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %i = fptoui float %x to i32
+ %f = uitofp i32 %i to float
+ ret float %f
+}
+
+; 16777217 is NOT exactly representable in f32.
+define float @test_inexact_16777217(float %x) {
+; CHECK-LABEL: test_inexact_16777217:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_i32_f32_e32 v0, v0
+; CHECK-NEXT: v_min_i32_e32 v0, 0x1000001, v0
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %i = fptosi float %x to i32
+ %clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
+ %f = sitofp i32 %clamped to float
+ ret float %f
+}
+
+declare i32 @llvm.smin.i32(i32, i32)