[llvm] [DAGCombiner] Extend FP-to-Int cast without requiring nsz (PR #161093)
Yi-Chi Lee via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 28 20:19:59 PDT 2025
https://github.com/yichi170 updated https://github.com/llvm/llvm-project/pull/161093
>From 51bf419075bc7a1e2f8d1e109893c844a03a2ce6 Mon Sep 17 00:00:00 2001
From: Yi-Chi Lee <yichi170 at gmail.com>
Date: Sun, 28 Sep 2025 10:41:29 -0500
Subject: [PATCH 1/3] [DAGCombiner] Extend FP-to-Int cast without requiring nsz
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 37 ++++++++++++++++---
1 file changed, 31 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a6ba6e518899f..d3798eedc82a4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18869,20 +18869,45 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
// FIXME: We should be able to use node-level FMF here.
// TODO: If strict math, should we use FABS (+ range check for signed cast)?
EVT VT = N->getValueType(0);
- if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
- !DAG.getTarget().Options.NoSignedZerosFPMath)
+ if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
return SDValue();
// fptosi/fptoui round towards zero, so converting from FP to integer and
// back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
SDValue N0 = N->getOperand(0);
if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
- N0.getOperand(0).getValueType() == VT)
- return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+ N0.getOperand(0).getValueType() == VT) {
+ if (DAG.getTarget().Options.NoSignedZerosFPMath)
+ return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+
+ unsigned IntWidth = N0.getValueSizeInBits();
+ APInt APMax = APInt::getSignedMaxValue(IntWidth);
+ APInt APMin = APInt::getSignedMinValue(IntWidth);
+
+ APFloat MaxAPF(VT.getFltSemantics());
+ MaxAPF.convertFromAPInt(APMax, true, APFloat::rmTowardZero);
+ APFloat MinAPF(VT.getFltSemantics());
+ MinAPF.convertFromAPInt(APMin, true, APFloat::rmTowardZero);
+
+ SDValue MaxFP = DAG.getConstantFP(MaxAPF, DL, VT);
+ SDValue MinFP = DAG.getConstantFP(MinAPF, DL, VT);
+
+ SDValue Clamped = DAG.getNode(ISD::FMINNUM, DL, VT,
+ DAG.getNode(ISD::FMAXNUM, DL, VT, N0->getOperand(0), MinFP),
+ MaxFP);
+ return DAG.getNode(ISD::FTRUNC, DL, VT, Clamped);
+ }
if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
- N0.getOperand(0).getValueType() == VT)
- return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+ N0.getOperand(0).getValueType() == VT) {
+ if (DAG.getTarget().Options.NoSignedZerosFPMath)
+ return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+
+ if (TLI.isFAbsFree(VT)) {
+ SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
+ }
+ }
return SDValue();
}
>From cccc07319d26ed660107eaa53bb363547ef09c43 Mon Sep 17 00:00:00 2001
From: Yi-Chi Lee <yichi170 at gmail.com>
Date: Sun, 28 Sep 2025 14:46:41 -0500
Subject: [PATCH 2/3] [DAGCombiner] Modify the comment to fit the current implementation and apply clang-format
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 21 ++++++++++++-------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d3798eedc82a4..65cea64e0982d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18862,12 +18862,15 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
const TargetLowering &TLI) {
- // We only do this if the target has legal ftrunc. Otherwise, we'd likely be
- // replacing casts with a libcall. We also must be allowed to ignore -0.0
- // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
- // conversions would return +0.0.
+ // We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
+ // If NoSignedZerosFPMath is enabled, this is a direct replacement.
+ // Otherwise, for strict math, we must handle edge cases:
+ // 1. For signed conversions, clamp out-of-range values to the valid
+ // integer range before the trunc.
+ // 2. For unsigned conversions, use FABS. A negative float becomes integer 0,
+ // which must convert back to +0.0. FTRUNC on its own could produce -0.0.
+
// FIXME: We should be able to use node-level FMF here.
- // TODO: If strict math, should we use FABS (+ range check for signed cast)?
EVT VT = N->getValueType(0);
if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
return SDValue();
@@ -18880,6 +18883,7 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
if (DAG.getTarget().Options.NoSignedZerosFPMath)
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+ // Strict math: clamp to the signed integer range before truncating.
unsigned IntWidth = N0.getValueSizeInBits();
APInt APMax = APInt::getSignedMaxValue(IntWidth);
APInt APMin = APInt::getSignedMinValue(IntWidth);
@@ -18892,9 +18896,9 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
SDValue MaxFP = DAG.getConstantFP(MaxAPF, DL, VT);
SDValue MinFP = DAG.getConstantFP(MinAPF, DL, VT);
- SDValue Clamped = DAG.getNode(ISD::FMINNUM, DL, VT,
- DAG.getNode(ISD::FMAXNUM, DL, VT, N0->getOperand(0), MinFP),
- MaxFP);
+ SDValue Clamped = DAG.getNode(
+ ISD::FMINNUM, DL, VT,
+ DAG.getNode(ISD::FMAXNUM, DL, VT, N0->getOperand(0), MinFP), MaxFP);
return DAG.getNode(ISD::FTRUNC, DL, VT, Clamped);
}
@@ -18903,6 +18907,7 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
if (DAG.getTarget().Options.NoSignedZerosFPMath)
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+ // Strict math: use FABS to handle negative inputs correctly.
if (TLI.isFAbsFree(VT)) {
SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
>From 37413f35756e67d43d5b3f54c0b0e94670cca5cc Mon Sep 17 00:00:00 2001
From: Yi-Chi Lee <yichi170 at gmail.com>
Date: Sun, 28 Sep 2025 22:19:06 -0500
Subject: [PATCH 3/3] update testcases
---
llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll | 49 ++++++++--
.../sve-streaming-mode-cvt-fp-int-fp.ll | 98 ++++++++++++++-----
.../amdgpu-simplify-libcall-pow-codegen.ll | 37 +++----
3 files changed, 133 insertions(+), 51 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
index 1207de746894b..f68188fdf54ce 100644
--- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
@@ -4,8 +4,13 @@
define double @t1(double %x) {
; CHECK-LABEL: t1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs d0, d0
-; CHECK-NEXT: scvtf d0, d0
+; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT: fmaxnm d0, d0, d1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fminnm d0, d0, d1
+; CHECK-NEXT: frintz d0, d0
; CHECK-NEXT: ret
entry:
%conv = fptosi double %x to i64
@@ -16,8 +21,12 @@ entry:
define float @t2(float %x) {
; CHECK-LABEL: t2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs s0, s0
-; CHECK-NEXT: scvtf s0, s0
+; CHECK-NEXT: movi v1.2s, #207, lsl #24
+; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff
+; CHECK-NEXT: fmaxnm s0, s0, s1
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: fminnm s0, s0, s1
+; CHECK-NEXT: frintz s0, s0
; CHECK-NEXT: ret
entry:
%conv = fptosi float %x to i32
@@ -28,8 +37,13 @@ entry:
define half @t3(half %x) {
; CHECK-LABEL: t3:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs h0, h0
-; CHECK-NEXT: scvtf h0, h0
+; CHECK-NEXT: mov w8, #64511 // =0xfbff
+; CHECK-NEXT: fmov h1, w8
+; CHECK-NEXT: mov w8, #31743 // =0x7bff
+; CHECK-NEXT: fmaxnm h0, h0, h1
+; CHECK-NEXT: fmov h1, w8
+; CHECK-NEXT: fminnm h0, h0, h1
+; CHECK-NEXT: frintz h0, h0
; CHECK-NEXT: ret
entry:
%conv = fptosi half %x to i32
@@ -170,8 +184,14 @@ entry:
define i64 @tests_f64_multiuse(double %x) {
; CHECK-LABEL: tests_f64_multiuse:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: scvtf d1, x8
+; CHECK-NEXT: fmaxnm d1, d0, d1
+; CHECK-NEXT: fminnm d1, d1, d2
+; CHECK-NEXT: frintz d1, d1
; CHECK-NEXT: fcmp d0, d1
; CHECK-NEXT: csel x0, x8, xzr, eq
; CHECK-NEXT: ret
@@ -186,8 +206,13 @@ entry:
define i32 @tests_f32_multiuse(float %x) {
; CHECK-LABEL: tests_f32_multiuse:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v1.2s, #207, lsl #24
+; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fcvtzs w8, s0
-; CHECK-NEXT: scvtf s1, w8
+; CHECK-NEXT: fmaxnm s1, s0, s1
+; CHECK-NEXT: fminnm s1, s1, s2
+; CHECK-NEXT: frintz s1, s1
; CHECK-NEXT: fcmp s0, s1
; CHECK-NEXT: csel w0, w8, wzr, eq
; CHECK-NEXT: ret
@@ -202,8 +227,14 @@ entry:
define i32 @tests_f16_multiuse(half %x) {
; CHECK-LABEL: tests_f16_multiuse:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #64511 // =0xfbff
+; CHECK-NEXT: fmov h1, w8
+; CHECK-NEXT: mov w8, #31743 // =0x7bff
+; CHECK-NEXT: fmov h2, w8
; CHECK-NEXT: fcvtzs w8, h0
-; CHECK-NEXT: scvtf h1, w8
+; CHECK-NEXT: fmaxnm h1, h0, h1
+; CHECK-NEXT: fminnm h1, h1, h2
+; CHECK-NEXT: frintz h1, h1
; CHECK-NEXT: fcmp h0, h1
; CHECK-NEXT: csel w0, w8, wzr, eq
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
index f4ae66a3b2259..bed62a428939e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -9,23 +9,35 @@ target triple = "aarch64-unknown-linux-gnu"
define double @t1(double %x) {
; CHECK-LABEL: t1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
-; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT: fmaxnm d0, d0, d1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fminnm d0, d0, d1
+; CHECK-NEXT: frintz d0, d0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t1:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
-; USE-NEON-NO-GPRS-NEXT: fcvtzs d0, d0
-; USE-NEON-NO-GPRS-NEXT: scvtf d0, d0
+; USE-NEON-NO-GPRS-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; USE-NEON-NO-GPRS-NEXT: fmov d1, x8
+; USE-NEON-NO-GPRS-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; USE-NEON-NO-GPRS-NEXT: fmaxnm d0, d0, d1
+; USE-NEON-NO-GPRS-NEXT: fmov d1, x8
+; USE-NEON-NO-GPRS-NEXT: fminnm d0, d0, d1
+; USE-NEON-NO-GPRS-NEXT: frintz d0, d0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t1:
; NONEON-NOSVE: // %bb.0: // %entry
-; NONEON-NOSVE-NEXT: fcvtzs x8, d0
-; NONEON-NOSVE-NEXT: scvtf d0, x8
+; NONEON-NOSVE-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; NONEON-NOSVE-NEXT: fmov d1, x8
+; NONEON-NOSVE-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; NONEON-NOSVE-NEXT: fmaxnm d0, d0, d1
+; NONEON-NOSVE-NEXT: fmov d1, x8
+; NONEON-NOSVE-NEXT: fminnm d0, d0, d1
+; NONEON-NOSVE-NEXT: frintz d0, d0
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptosi double %x to i64
@@ -36,23 +48,35 @@ entry:
define float @t2(float %x) {
; CHECK-LABEL: t2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
-; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
-; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff
+; CHECK-NEXT: fmaxnm s0, s0, s1
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: fminnm s0, s0, s1
+; CHECK-NEXT: frintz s0, s0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t2:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
-; USE-NEON-NO-GPRS-NEXT: fcvtzs s0, s0
-; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0
+; USE-NEON-NO-GPRS-NEXT: mov w8, #-822083584 // =0xcf000000
+; USE-NEON-NO-GPRS-NEXT: fmov s1, w8
+; USE-NEON-NO-GPRS-NEXT: mov w8, #1325400063 // =0x4effffff
+; USE-NEON-NO-GPRS-NEXT: fmaxnm s0, s0, s1
+; USE-NEON-NO-GPRS-NEXT: fmov s1, w8
+; USE-NEON-NO-GPRS-NEXT: fminnm s0, s0, s1
+; USE-NEON-NO-GPRS-NEXT: frintz s0, s0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t2:
; NONEON-NOSVE: // %bb.0: // %entry
-; NONEON-NOSVE-NEXT: fcvtzs w8, s0
-; NONEON-NOSVE-NEXT: scvtf s0, w8
+; NONEON-NOSVE-NEXT: mov w8, #-822083584 // =0xcf000000
+; NONEON-NOSVE-NEXT: fmov s1, w8
+; NONEON-NOSVE-NEXT: mov w8, #1325400063 // =0x4effffff
+; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT: fmov s1, w8
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT: frintz s0, s0
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptosi float %x to i32
@@ -63,18 +87,36 @@ entry:
define half @t3(half %x) {
; CHECK-LABEL: t3:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
-; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: adrp x8, .LCPI2_1
+; CHECK-NEXT: fmaxnm h0, h0, h1
+; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI2_1]
+; CHECK-NEXT: fminnm h0, h0, h1
+; CHECK-NEXT: frintz h0, h0
; CHECK-NEXT: ret
;
+; USE-NEON-NO-GPRS-LABEL: t3:
+; USE-NEON-NO-GPRS: // %bb.0: // %entry
+; USE-NEON-NO-GPRS-NEXT: adrp x8, .LCPI2_0
+; USE-NEON-NO-GPRS-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
+; USE-NEON-NO-GPRS-NEXT: adrp x8, .LCPI2_1
+; USE-NEON-NO-GPRS-NEXT: fmaxnm h0, h0, h1
+; USE-NEON-NO-GPRS-NEXT: ldr h1, [x8, :lo12:.LCPI2_1]
+; USE-NEON-NO-GPRS-NEXT: fminnm h0, h0, h1
+; USE-NEON-NO-GPRS-NEXT: frintz h0, h0
+; USE-NEON-NO-GPRS-NEXT: ret
+;
; NONEON-NOSVE-LABEL: t3:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fcvtzs w8, s0
-; NONEON-NOSVE-NEXT: scvtf s0, w8
+; NONEON-NOSVE-NEXT: mov w8, #-822083584 // =0xcf000000
+; NONEON-NOSVE-NEXT: fmov s1, w8
+; NONEON-NOSVE-NEXT: mov w8, #1325400063 // =0x4effffff
+; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT: fmov s1, w8
+; NONEON-NOSVE-NEXT: fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT: frintz s0, s0
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: ret
entry:
@@ -147,6 +189,12 @@ define half @t6(half %x) {
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
+; USE-NEON-NO-GPRS-LABEL: t6:
+; USE-NEON-NO-GPRS: // %bb.0: // %entry
+; USE-NEON-NO-GPRS-NEXT: fcvtzu h0, h0
+; USE-NEON-NO-GPRS-NEXT: ucvtf h0, h0
+; USE-NEON-NO-GPRS-NEXT: ret
+;
; NONEON-NOSVE-LABEL: t6:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 0329f23ea434f..85f99698951b2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -60,15 +60,16 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: v_log_f16_e64 v3, |v0|
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
-; CHECK-NEXT: v_cvt_f32_i32_e32 v2, v1
-; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1
-; CHECK-NEXT: v_and_b32_e32 v0, v1, v0
-; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
-; CHECK-NEXT: v_exp_f16_e32 v2, v2
-; CHECK-NEXT: v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT: v_cvt_f32_f16_e32 v2, v1
+; CHECK-NEXT: v_max_f16_e32 v1, 0xfbff, v1
+; CHECK-NEXT: v_min_f16_e32 v1, 0x7bff, v1
+; CHECK-NEXT: v_trunc_f16_e32 v1, v1
+; CHECK-NEXT: v_cvt_i32_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_f16_e32 v1, v3, v1
+; CHECK-NEXT: v_exp_f16_e32 v1, v1
+; CHECK-NEXT: v_lshlrev_b16_e32 v2, 15, v2
+; CHECK-NEXT: v_and_b32_e32 v0, v2, v0
+; CHECK-NEXT: v_or_b32_e32 v0, v0, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to half
%pow = tail call fast half @_Z3powDhDh(half %x, half %y)
@@ -79,28 +80,30 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f32__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
-; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT: v_log_f32_e32 v3, v3
+; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
-; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2
+; CHECK-NEXT: v_max_f32_e32 v3, 0xcf000000, v1
+; CHECK-NEXT: v_min_f32_e32 v3, 0x4effffff, v3
+; CHECK-NEXT: v_trunc_f32_e32 v3, v3
+; CHECK-NEXT: v_mul_f32_e32 v4, v2, v3
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4
; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000
-; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
+; CHECK-NEXT: v_fma_f32 v2, v2, v3, v4
; CHECK-NEXT: v_exp_f32_e32 v2, v2
+; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
; CHECK-NEXT: v_not_b32_e32 v3, 63
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT: v_ldexp_f32 v2, v2, v3
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to float
More information about the llvm-commits mailing list