[llvm] 90c31b0 - [X86] Custom lower ISD::FROUND with SSE4.1 to avoid a libcall.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 29 09:10:53 PST 2020


Author: Craig Topper
Date: 2020-01-29T09:10:02-08:00
New Revision: 90c31b0f428fe911255277a60782ea9114700475

URL: https://github.com/llvm/llvm-project/commit/90c31b0f428fe911255277a60782ea9114700475
DIFF: https://github.com/llvm/llvm-project/commit/90c31b0f428fe911255277a60782ea9114700475.diff

LOG: [X86] Custom lower ISD::FROUND with SSE4.1 to avoid a libcall.

ISD::FROUND is defined to round to nearest with ties rounding
away from 0. This mode isn't supported in hardware on X86.

But as long as we aren't compiling with trapping math, we can
emulate it with trunc(X + copysign(nextafter(0.5, 0.0), X)).
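
For illustration, here is a minimal scalar sketch of the emulation in
standalone C++ (not part of the patch; emulated_roundf is a hypothetical
name, and this assumes the default round-to-nearest environment with no
trapping math):

    #include <cmath>

    // Emulate round-half-away-from-zero without a libcall: add the
    // largest float strictly below 0.5, carrying x's sign, then drop
    // the fraction. std::trunc stands in for the FTRUNC node that the
    // lowering emits (vroundss/vroundps with immediate 11).
    float emulated_roundf(float x) {
      float adder = std::copysign(std::nextafter(0.5f, 0.0f), x);
      return std::trunc(x + adder);
    }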

We have to use nextafter to avoid some corner cases that adding
0.5 directly would hit. For example, if X is nextafter(0.5, 0.0) it
should round to 0.0, but adding 0.5 would need one more bit of
mantissa than can be stored, so the sum rounds to 1.0. Adding
nextafter(0.5, 0.0) instead just increases the exponent by 1 and
leaves the mantissa as all 1s. This gives nextafter(1.0, 0.0),
which truncates to 0.0.
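
A quick standalone check of that corner case (again only a sketch, not
from the patch):

    #include <cmath>
    #include <cstdio>

    int main() {
      // Largest float below 0.5 (0x1.fffffep-2); round() should give 0.0.
      float x = std::nextafter(0.5f, 0.0f);
      // x + 0.5 needs 25 bits of mantissa, so it rounds up to exactly 1.0
      // and truncation then yields the wrong answer.
      std::printf("%a\n", std::trunc(x + 0.5f)); // 0x1p+0  (wrong: 1.0)
      // x + x doubles x: exponent goes up by one, mantissa stays all 1s,
      // giving nextafter(1.0, 0.0), which truncates correctly.
      std::printf("%a\n", std::trunc(x + x));    // 0x0p+0  (correct: 0.0)
    }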

Technically this requires -fno-trapping-math, which isn't our
default. But if we care about exceptions we should be using
constrained intrinsics. Constrained intrinsics would use
STRICT_FROUND, which won't go through this code.

Fixes PR42195.

Differential Revision: https://reviews.llvm.org/D73607
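
A side note on the vround* immediates in the updated tests below: the
immediate encodes the rounding override. My reading of the SSE4.1
encoding (an aside, not stated in the patch):

    // SSE4.1 ROUND* imm8, as used in the checks below:
    //   bits 1:0 = rounding mode (0b11 = toward zero, i.e. truncate)
    //   bit  2   = 0 to use the immediate mode, 1 to use MXCSR.RC
    //   bit  3   = 1 to suppress the inexact (precision) exception
    constexpr int kRoundTruncNoExc = 0b1011; // == 11, the $11 in the tests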

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/extractelement-fp.ll
    llvm/test/CodeGen/X86/vec-libcalls.ll

Removed: 
    llvm/test/CodeGen/X86/vec_round.ll


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f8eaa8751bfe..cba0199e2272 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1078,6 +1078,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
       setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
+
+      setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
     }
 
     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
@@ -1170,6 +1172,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+
+      setOperationAction(ISD::FROUND,            VT, Custom);
+
       setOperationAction(ISD::FNEG,              VT, Custom);
       setOperationAction(ISD::FABS,              VT, Custom);
       setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
@@ -1535,6 +1540,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
 
+      setOperationAction(ISD::FROUND,            VT, Custom);
+
       setOperationAction(ISD::SELECT,           VT, Custom);
     }
 
@@ -20450,6 +20457,30 @@ SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
   return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
 }
 
+/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
+/// This mode isn't supported in hardware on X86. But as long as we aren't
+/// compiling with trapping math, we can emulate this with
+/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
+static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
+  SDValue N0 = Op.getOperand(0);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // N0 += copysign(nextafter(0.5, 0.0), N0)
+  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
+  bool Ignored;
+  APFloat Point5Pred = APFloat(0.5f);
+  Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
+  Point5Pred.next(/*nextDown*/true);
+
+  SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
+                              DAG.getConstantFP(Point5Pred, dl, VT), N0);
+  N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
+
+  // Truncate the result to remove fraction.
+  return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
+}
+
 /// The only differences between FABS and FNEG are the mask and the logic op.
 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
@@ -28623,6 +28654,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
   case ISD::FADD:
   case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
+  case ISD::FROUND:             return LowerFROUND(Op, DAG);
   case ISD::FABS:
   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);

diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
index 7d5f18b59e8c..7e3e263f0db7 100644
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -1067,13 +1067,25 @@ define double @nearbyint_v4f64(<4 x double> %x) nounwind {
 define float @round_v4f32(<4 x float> %x) nounwind {
 ; X64-LABEL: round_v4f32:
 ; X64:       # %bb.0:
-; X64-NEXT:    jmp roundf # TAILCALL
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT:    vandps %xmm1, %xmm0, %xmm1
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; X64-NEXT:    vorps %xmm1, %xmm2, %xmm1
+; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
+; X64-NEXT:    retq
 ;
 ; X86-LABEL: round_v4f32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vandps %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; X86-NEXT:    vorps %xmm1, %xmm2, %xmm1
+; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
-; X86-NEXT:    calll roundf
+; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    retl
   %v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
@@ -1084,17 +1096,32 @@ define float @round_v4f32(<4 x float> %x) nounwind {
 define double @round_v4f64(<4 x double> %x) nounwind {
 ; X64-LABEL: round_v4f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X64-NEXT:    vandpd {{.*}}(%rip), %xmm0, %xmm1
+; X64-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; X64-NEXT:    # xmm2 = mem[0,0]
+; X64-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
-; X64-NEXT:    jmp round # TAILCALL
+; X64-NEXT:    retq
 ;
 ; X86-LABEL: round_v4f64:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    vmovlps %xmm0, (%esp)
+; X86-NEXT:    vandpd {{\.LCPI.*}}, %xmm0, %xmm1
+; X86-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; X86-NEXT:    # xmm2 = mem[0,0]
+; X86-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vmovsd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    vzeroupper
-; X86-NEXT:    calll round
-; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    retl
   %v = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
   %r = extractelement <4 x double> %v, i32 0

diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll
index 5e8d3211459e..83663fc899d1 100644
--- a/llvm/test/CodeGen/X86/vec-libcalls.ll
+++ b/llvm/test/CodeGen/X86/vec-libcalls.ll
@@ -386,16 +386,10 @@ define <2 x float> @rint_v2f32(<2 x float> %x) nounwind {
 define <2 x float> @round_v2f32(<2 x float> %x) nounwind {
 ; CHECK-LABEL: round_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $40, %rsp
-; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    callq roundf
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT:    callq roundf
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-NEXT:    vorps {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vroundps $11, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %r = call <2 x float> @llvm.round.v2f32(<2 x float> %x)
   ret <2 x float> %r

diff --git a/llvm/test/CodeGen/X86/vec_round.ll b/llvm/test/CodeGen/X86/vec_round.ll
deleted file mode 100644
index 6bc38400aa34..000000000000
--- a/llvm/test/CodeGen/X86/vec_round.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mcpu=nehalem -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-declare void @use(<2 x double>)
-
-; Function Attrs: nounwind uwtable
-define void @test() {
-; CHECK-LABEL: test:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq round
-; CHECK-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT:    callq use
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
-entry:
-  %tmp = call <2 x double> @llvm.round.v2f64(<2 x double> undef)
-  call void @use(<2 x double> %tmp)
-  ret void
-}
-
-; Function Attrs: nounwind readonly
-declare <2 x double> @llvm.round.v2f64(<2 x double>) #0
-
-attributes #0 = { nounwind readonly }
-


        

