[llvm] [X86][NVPTX][LegalizeDAG] If i16 legal, legalize FABS.f16 with Expand (PR #106153)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 26 15:34:55 PDT 2024
https://github.com/v01dXYZ created https://github.com/llvm/llvm-project/pull/106153
For the affected targets (X86, NVPTX), `Expand` clears the sign bit after bitcasting to i16. This is cheaper than `Promote`, which requires converting back and forth between f16 and f32 (with possibly expensive libcalls) in order to call FABS.f32.
From 87097a5df5afe34eb09b90f01ad80d5411b5e0d4 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Tue, 27 Aug 2024 00:02:14 +0200
Subject: [PATCH] [X86][NVPTX][LegalizeDAG] If i16 legal, legalize FABS.f16
with Expand
For the affected targets, `Expand` clears the sign bit after
bitcasting to i16. This is cheaper than `Promote`, which requires
converting back and forth between f16 and f32 (with possibly expensive
libcalls) in order to call FABS.f32.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-
llvm/test/CodeGen/NVPTX/f16-instructions.ll | 12 +++----
llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 18 ++++------
llvm/test/CodeGen/X86/fp16-libcalls.ll | 35 ++++---------------
llvm/test/CodeGen/X86/half.ll | 12 +++----
6 files changed, 27 insertions(+), 56 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 73791102fc04de..5b0a87d5f150f8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -849,7 +849,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
AddPromotedToType(Op, MVT::bf16, MVT::f32);
}
for (const auto &Op : {ISD::FABS}) {
- setOperationAction(Op, MVT::f16, Promote);
+ // Expand instead of Promote to clear sign bit by bitcasting to i16.
+ setOperationAction(Op, MVT::f16, Expand);
setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);
setOperationAction(Op, MVT::v2f16, Expand);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1a6be4eb5af1ef..274a03b6664736 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -599,7 +599,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
- setOperationAction(ISD::FABS, VT, Action);
+ // Expand instead of Promote to clear sign bit by bitcasting to i16.
+ setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FNEG, VT, Action);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FREM, VT, Action);
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 14e02a49f6e5e4..cfabf1d6639c2a 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -981,14 +981,10 @@ define half @test_fma(half %a, half %b, half %c) #0 {
}
; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
-; CHECK-NOFTZ: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ: abs.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ: abs.ftz.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
+; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
+; CHECK: and.b16 [[RF:%rs[0-9]+]], [[A]], 32767;
+; CHECK: st.param.b16 [func_retval0+0], [[RF]];
+; CHECK: ret;
define half @test_fabs(half %a) #0 {
%r = call half @llvm.fabs.f16(half %a)
ret half %r
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 464b3a754804fe..c2189a20c98c25 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1183,17 +1183,13 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
}
; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
-; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
+; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
+; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK: and.b16 [[A2:%rs[0-9]+]], [[A1]], 32767;
+; CHECK: and.b16 [[A3:%rs[0-9]+]], [[A0]], 32767;
+; CHECK: mov.b32 [[B:%r[0-9]+]], {[[A3]], [[A2]]};
+; CHECK: st.param.b32 [func_retval0+0], [[B]];
+; CHECK: ret;
define <2 x half> @test_fabs(<2 x half> %a) #0 {
%r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
ret <2 x half> %r
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index db3d031a8fe3fb..ee342f22eeff66 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -124,11 +124,7 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_fabs:
; F16C: # %bb.0:
; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: andl $32767, %eax # imm = 0x7FFF
; F16C-NEXT: movw %ax, (%rdi)
; F16C-NEXT: retq
;
@@ -141,34 +137,17 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
;
; X64-LABEL: test_half_fabs:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq __extendhfsf2@PLT
-; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: callq __truncsfhf2@PLT
; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: movw %ax, (%rbx)
-; X64-NEXT: popq %rbx
+; X64-NEXT: andl $32767, %eax # imm = 0x7FFF
+; X64-NEXT: movw %ax, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: test_half_fabs:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esi)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: retl
%res = call half @llvm.fabs.half(half %a0)
store half %res, ptr %p0, align 2
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 9f01d07e6a6705..6838925240058e 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1059,7 +1059,6 @@ define void @main.158() #0 {
; CHECK-LIBCALL: # %bb.0: # %entry
; CHECK-LIBCALL-NEXT: pushq %rax
; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0
-; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT
; CHECK-LIBCALL-NEXT: callq __extendhfsf2@PLT
; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-LIBCALL-NEXT: ucomiss %xmm0, %xmm1
@@ -1077,10 +1076,10 @@ define void @main.158() #0 {
; BWON-F16C-LABEL: main.158:
; BWON-F16C: # %bb.0: # %entry
; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; BWON-F16C-NEXT: vucomiss %xmm1, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; BWON-F16C-NEXT: vucomiss %xmm0, %xmm1
+; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
; BWON-F16C-NEXT: jae .LBB20_2
; BWON-F16C-NEXT: # %bb.1: # %entry
; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
@@ -1093,8 +1092,7 @@ define void @main.158() #0 {
; CHECK-I686-LABEL: main.158:
; CHECK-I686: # %bb.0: # %entry
; CHECK-I686-NEXT: subl $12, %esp
-; CHECK-I686-NEXT: movl $0, (%esp)
-; CHECK-I686-NEXT: calll __truncsfhf2
+; CHECK-I686-NEXT: pxor %xmm0, %xmm0
; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
; CHECK-I686-NEXT: movw %ax, (%esp)
; CHECK-I686-NEXT: calll __extendhfsf2
More information about the llvm-commits
mailing list