[llvm] [X86][NVPTX][LegalizeDAG] If i16 legal, legalize FABS.f16 with Expand (PR #106153)

Mon Aug 26 15:34:55 PDT 2024

https://github.com/v01dXYZ created https://github.com/llvm/llvm-project/pull/106153

For the concerned targets (X86, NVPTX), `Expand` clears the sign bit after bitcasting to i16. This is cheaper than `Promote` that requires converting back and forth from f16 to f32 (with possibly expensive libcalls) in order to call FABS.f32

>From 87097a5df5afe34eb09b90f01ad80d5411b5e0d4 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Tue, 27 Aug 2024 00:02:14 +0200
Subject: [PATCH] [X86][NVPTX][LegalizeDAG] If i16 legal, legalize FABS.f16
 with Expand

For the concerned targets, `Expand` clears the sign bit after
bitcasting to i16. This is cheaper than `Promote` that requires
converting back and forth from f16 to f32 (with possibly expensive
libcalls) in order to call FABS.f32
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   |  3 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  3 +-
 llvm/test/CodeGen/NVPTX/f16-instructions.ll   | 12 +++----
 llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 18 ++++------
 llvm/test/CodeGen/X86/fp16-libcalls.ll        | 35 ++++---------------
 llvm/test/CodeGen/X86/half.ll                 | 12 +++----
 6 files changed, 27 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 73791102fc04de..5b0a87d5f150f8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -849,7 +849,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
     AddPromotedToType(Op, MVT::bf16, MVT::f32);
   }
   for (const auto &Op : {ISD::FABS}) {
-    setOperationAction(Op, MVT::f16, Promote);
+    // Expand instead of Promote to clear sign bit by bitcasting to i16
+    setOperationAction(Op, MVT::f16, Expand);
     setOperationAction(Op, MVT::f32, Legal);
     setOperationAction(Op, MVT::f64, Legal);
     setOperationAction(Op, MVT::v2f16, Expand);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1a6be4eb5af1ef..274a03b6664736 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -599,7 +599,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
 
   auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
-    setOperationAction(ISD::FABS, VT, Action);
+    // Expand instead of Promote to clear sign bit by bitcasting to i16.
+    setOperationAction(ISD::FABS, VT, Expand);
     setOperationAction(ISD::FNEG, VT, Action);
     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
     setOperationAction(ISD::FREM, VT, Action);
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 14e02a49f6e5e4..cfabf1d6639c2a 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -981,14 +981,10 @@ define half @test_fma(half %a, half %b, half %c) #0 {
 }
 
 ; CHECK-LABEL: test_fabs(
-; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fabs_param_0];
-; CHECK-NOFTZ:      cvt.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ:      abs.f32         [[RF:%f[0-9]+]], [[AF]];
-; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ:      abs.ftz.f32         [[RF:%f[0-9]+]], [[AF]];
-; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
-; CHECK:      st.param.b16    [func_retval0+0], [[R]];
-; CHECK:      ret;
+; CHECK:    ld.param.b16    [[A:%rs[0-9]+]], [test_fabs_param_0];
+; CHECK:    and.b16 [[RF:%rs[0-9]+]], [[A]], 32767;
+; CHECK:    st.param.b16 [func_retval0+0], [[RF]];
+; CHECK:    ret;
 define half @test_fabs(half %a) #0 {
   %r = call half @llvm.fabs.f16(half %a)
   ret half %r
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 464b3a754804fe..c2189a20c98c25 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1183,17 +1183,13 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
 }
 
 ; CHECK-LABEL: test_fabs(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fabs_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  abs.f32         [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  abs.f32         [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
+; CHECK:    ld.param.b32    [[A:%r[0-9]+]], [test_fabs_param_0];
+; CHECK:    mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK:    and.b16 [[A2:%rs[0-9]+]], [[A1]], 32767;
+; CHECK:    and.b16 [[A3:%rs[0-9]+]], [[A0]], 32767;
+; CHECK:    mov.b32 [[B:%r[0-9]+]], {[[A3]], [[A2]]};
+; CHECK:    st.param.b32 [func_retval0+0], [[B]];
+; CHECK:    ret;
 define <2 x half> @test_fabs(<2 x half> %a) #0 {
   %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
   ret <2 x half> %r
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index db3d031a8fe3fb..ee342f22eeff66 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -124,11 +124,7 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ; F16C-LABEL: test_half_fabs:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vpextrw $0, %xmm0, %eax
-; F16C-NEXT:    vmovd %eax, %xmm0
-; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
+; F16C-NEXT:    andl $32767, %eax # imm = 0x7FFF
 ; F16C-NEXT:    movw %ax, (%rdi)
 ; F16C-NEXT:    retq
 ;
@@ -141,34 +137,17 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ;
 ; X64-LABEL: test_half_fabs:
 ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    callq __extendhfsf2 at PLT
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    callq __truncsfhf2 at PLT
 ; X64-NEXT:    pextrw $0, %xmm0, %eax
-; X64-NEXT:    movw %ax, (%rbx)
-; X64-NEXT:    popq %rbx
+; X64-NEXT:    andl $32767, %eax # imm = 0x7FFF
+; X64-NEXT:    movw %ax, (%rdi)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_half_fabs:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esp)
-; X86-NEXT:    calll __extendhfsf2
-; X86-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    calll __truncsfhf2
-; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    retl
   %res = call half @llvm.fabs.half(half %a0)
   store half %res, ptr %p0, align 2
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 9f01d07e6a6705..6838925240058e 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1059,7 +1059,6 @@ define void @main.158() #0 {
 ; CHECK-LIBCALL:       # %bb.0: # %entry
 ; CHECK-LIBCALL-NEXT:    pushq %rax
 ; CHECK-LIBCALL-NEXT:    xorps %xmm0, %xmm0
-; CHECK-LIBCALL-NEXT:    callq __truncsfhf2 at PLT
 ; CHECK-LIBCALL-NEXT:    callq __extendhfsf2 at PLT
 ; CHECK-LIBCALL-NEXT:    movss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-LIBCALL-NEXT:    ucomiss %xmm0, %xmm1
@@ -1077,10 +1076,10 @@ define void @main.158() #0 {
 ; BWON-F16C-LABEL: main.158:
 ; BWON-F16C:       # %bb.0: # %entry
 ; BWON-F16C-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
-; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm2 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; BWON-F16C-NEXT:    vucomiss %xmm1, %xmm2
+; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; BWON-F16C-NEXT:    vucomiss %xmm0, %xmm1
+; BWON-F16C-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; BWON-F16C-NEXT:    jae .LBB20_2
 ; BWON-F16C-NEXT:  # %bb.1: # %entry
 ; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
@@ -1093,8 +1092,7 @@ define void @main.158() #0 {
 ; CHECK-I686-LABEL: main.158:
 ; CHECK-I686:       # %bb.0: # %entry
 ; CHECK-I686-NEXT:    subl $12, %esp
-; CHECK-I686-NEXT:    movl $0, (%esp)
-; CHECK-I686-NEXT:    calll __truncsfhf2
+; CHECK-I686-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
 ; CHECK-I686-NEXT:    calll __extendhfsf2