[llvm] r331992 - [x86] fix fmaxnum/fminnum with nnan
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Thu May 10 08:40:49 PDT 2018
Author: spatel
Date: Thu May 10 08:40:49 2018
New Revision: 331992
URL: http://llvm.org/viewvc/llvm-project?rev=331992&view=rev
Log:
[x86] fix fmaxnum/fminnum with nnan
With nnan, there's no need for the masked merge / blend
sequence, which probably costs much more than a single
min/max instruction.
Somewhere between clang 5.0 and 6.0, we started producing
these intrinsics for fmax()/fmin() in C source instead of
libcalls or fcmp/select. The backend wasn't prepared for
that, so we regressed performance in those cases.
Note: it's possible that other targets have problems
similar to the ones seen here.
Noticed while investigating PR37403 and related bugs:
https://bugs.llvm.org/show_bug.cgi?id=37403
The IR FMF propagation cases still don't work. There's
a proposal in D46563 that might fix those cases.
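As an aside (not part of this patch), here is a minimal IR sketch of
the newly folded case. The function name is hypothetical, and the #0
attribute group is assumed to be spelled the same way as in the
updated tests:

  ; Sketch only: hypothetical function name; attribute spelling assumed.
  define float @max_nonans(float %a, float %b) #0 {
    %r = tail call float @llvm.maxnum.f32(float %a, float %b)
    ret float %r
  }
  declare float @llvm.maxnum.f32(float, float)
  attributes #0 = { "no-nans-fp-math"="true" }

With this patch, llc for x86-64 should emit just 'maxss %xmm1, %xmm0'
for this function instead of the cmpunordss + and/andn/or blend
sequence shown on the removed CHECK lines below.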
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/fmaxnum.ll
llvm/trunk/test/CodeGen/X86/fminnum.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=331992&r1=331991&r2=331992&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu May 10 08:40:49 2018
@@ -36364,8 +36364,6 @@ static SDValue combineFMinNumFMaxNum(SDN
if (Subtarget.useSoftFloat())
return SDValue();
- // TODO: Check for global or instruction-level "nnan". In that case, we
- // should be able to lower to FMAX/FMIN alone.
// TODO: If an operand is already known to be a NaN or not a NaN, this
// should be an optional swap and FMAX/FMIN.
@@ -36375,14 +36373,21 @@ static SDValue combineFMinNumFMaxNum(SDN
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
return SDValue();
- // This takes at least 3 instructions, so favor a library call when operating
- // on a scalar and minimizing code size.
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
- return SDValue();
-
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
+ auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+
+ // If we don't have to respect NaN inputs, this is a direct translation to x86
+ // min/max instructions.
+ if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+
+ // If we have to respect NaN inputs, this takes at least 3 instructions.
+ // Favor a library call when operating on a scalar and minimizing code size.
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+ return SDValue();
+
EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
DAG.getDataLayout(), *DAG.getContext(), VT);
@@ -36405,9 +36410,8 @@ static SDValue combineFMinNumFMaxNum(SDN
// use those instructions for fmaxnum by selecting away a NaN input.
// If either operand is NaN, the 2nd source operand (Op0) is passed through.
- auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
- SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+ SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
Modified: llvm/trunk/test/CodeGen/X86/fmaxnum.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fmaxnum.ll?rev=331992&r1=331991&r2=331992&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fmaxnum.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fmaxnum.ll Thu May 10 08:40:49 2018
@@ -285,7 +285,7 @@ define <8 x double> @test_intrinsic_fmax
ret <8 x double> %z
}
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
@@ -333,49 +333,33 @@ define <4 x float> @maxnum_intrinsic_nna
ret <4 x float> %r
}
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
define float @maxnum_intrinsic_nnan_attr_f32(float %a, float %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: cmpunordss %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: maxss %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm1, %xmm2
-; SSE-NEXT: orps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: maxss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call float @llvm.maxnum.f32(float %a, float %b)
ret float %r
}
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
define <2 x double> @maxnum_intrinsic_nnan_attr_f64(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f64:
; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm1, %xmm2
-; SSE-NEXT: maxpd %xmm0, %xmm2
-; SSE-NEXT: cmpunordpd %xmm0, %xmm0
-; SSE-NEXT: andpd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm2, %xmm0
-; SSE-NEXT: orpd %xmm1, %xmm0
+; SSE-NEXT: maxpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vmaxpd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
ret <2 x double> %r
Modified: llvm/trunk/test/CodeGen/X86/fminnum.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fminnum.ll?rev=331992&r1=331991&r2=331992&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fminnum.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fminnum.ll Thu May 10 08:40:49 2018
@@ -277,7 +277,7 @@ define <8 x double> @test_intrinsic_fmin
ret <8 x double> %z
}
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {
; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:
@@ -325,49 +325,33 @@ define <2 x double> @minnum_intrinsic_nn
ret <2 x double> %r
}
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
define double @minnum_intrinsic_nnan_attr_f64(double %a, double %b) #0 {
; SSE-LABEL: minnum_intrinsic_nnan_attr_f64:
; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: cmpunordsd %xmm0, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm3
-; SSE-NEXT: andpd %xmm1, %xmm3
-; SSE-NEXT: minsd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm1, %xmm2
-; SSE-NEXT: orpd %xmm3, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: minsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: minnum_intrinsic_nnan_attr_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call double @llvm.minnum.f64(double %a, double %b)
ret double %r
}
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
define <4 x float> @minnum_intrinsic_nnan_attr_v4f32(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: minnum_intrinsic_nnan_attr_v4f32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: minps %xmm0, %xmm2
-; SSE-NEXT: cmpunordps %xmm0, %xmm0
-; SSE-NEXT: andps %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm2, %xmm0
-; SSE-NEXT: orps %xmm1, %xmm0
+; SSE-NEXT: minps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: minnum_intrinsic_nnan_attr_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vminps %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %r