[llvm] [X86][NVPTX][LegalizeDAG] If i16 legal, legalize FABS/FNEG/FCOPYSIGN (f16) with Expand (PR #106153)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 27 18:49:45 PDT 2024
https://github.com/v01dXYZ updated https://github.com/llvm/llvm-project/pull/106153
From 02f912a92087318eb940040bbdb737f9f6b18c34 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Wed, 28 Aug 2024 00:45:57 +0200
Subject: [PATCH 1/2] [X86] test pre-commit: Legalise FNEG/FCOPYSIGN (f16) with
Expand
---
llvm/test/CodeGen/X86/fp16-libcalls.ll | 98 ++++++++++++++++++++++++++
1 file changed, 98 insertions(+)
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index db3d031a8fe3fb..bd1dc47efa41ff 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -175,6 +175,104 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
ret void
}
+define void @test_half_fneg(half %a0, ptr %p0) nounwind {
+; F16C-LABEL: test_half_fneg:
+; F16C: # %bb.0:
+; F16C-NEXT: vpextrw $0, %xmm0, %eax
+; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_fneg:
+; FP16: # %bb.0:
+; FP16-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FP16-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; FP16-NEXT: vmovsh %xmm0, (%rdi)
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_fneg:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: callq __extendhfsf2 at PLT
+; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: callq __truncsfhf2 at PLT
+; X64-NEXT: pextrw $0, %xmm0, %eax
+; X64-NEXT: movw %ax, (%rbx)
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_fneg:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: movd %xmm0, (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esi)
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+ %res = fneg half %a0
+ store half %res, ptr %p0, align 2
+ ret void
+}
+
+define void @test_half_fcopysign(half %a0, half %a1, ptr %p0) nounwind {
+; F16C-LABEL: test_half_fcopysign:
+; F16C: # %bb.0:
+; F16C-NEXT: vpextrw $0, %xmm1, %eax
+; F16C-NEXT: andl $32768, %eax # imm = 0x8000
+; F16C-NEXT: vpextrw $0, %xmm0, %ecx
+; F16C-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; F16C-NEXT: orl %eax, %ecx
+; F16C-NEXT: movw %cx, (%rdi)
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_fcopysign:
+; FP16: # %bb.0:
+; FP16-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; FP16-NEXT: vpternlogd $202, %xmm1, %xmm0, %xmm2
+; FP16-NEXT: vmovsh %xmm2, (%rdi)
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_fcopysign:
+; X64: # %bb.0:
+; X64-NEXT: pextrw $0, %xmm1, %eax
+; X64-NEXT: andl $32768, %eax # imm = 0x8000
+; X64-NEXT: pextrw $0, %xmm0, %ecx
+; X64-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_fcopysign:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $32768, %ecx # imm = 0x8000
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $32767, %edx # imm = 0x7FFF
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movw %dx, (%eax)
+; X86-NEXT: retl
+ %res = call half @llvm.copysign.half(half %a0, half %a1)
+ store half %res, ptr %p0, align 2
+ ret void
+}
+
define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind {
; F16C-LABEL: test_half_pow:
; F16C: # %bb.0:
From 2c1a3e15104cef8afa2887aedf72e0d76d5915b1 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Tue, 27 Aug 2024 00:02:14 +0200
Subject: [PATCH 2/2] [X86][NVPTX][LegalizeDAG] If i16 legal, legalize
FABS/FNEG/FCOPYSIGN (f16) with Expand
On the affected targets, `Expand` clears/flips/copies the sign bit
after bitcasting the f16 value to i16. This is cheaper than `Promote`,
which converts back and forth between f16 and f32 (possibly through
expensive libcalls) just to perform FABS/FNEG/FCOPYSIGN on f32.
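
Roughly, the expanded lowering computes the following (a plain C++
sketch of the bit manipulation, operating on the raw i16 bits rather
than the actual SelectionDAG nodes; the function names are illustrative
only, but the constants match the `andl $32767` / `xorl $32768` /
`orl` sequences in the updated X86 tests):

  #include <cstdint>

  // Half values are represented here by their raw IEEE-754 binary16 bits.
  uint16_t fabs_f16(uint16_t A) { return A & 0x7FFF; }  // clear sign bit
  uint16_t fneg_f16(uint16_t A) { return A ^ 0x8000; }  // flip sign bit
  uint16_t copysign_f16(uint16_t Mag, uint16_t Sgn) {
    return (Mag & 0x7FFF) | (Sgn & 0x8000);             // copy sign bit
  }

With `Promote`, each of these operations instead round-trips through
f32 (an `__extendhfsf2`/`__truncsfhf2` libcall pair on targets without
F16C), as the removed CHECK lines in the test diffs below show.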
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 7 +-
llvm/test/CodeGen/NVPTX/f16-instructions.ll | 12 ++--
llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 19 +++--
llvm/test/CodeGen/X86/fp16-libcalls.ll | 70 ++++---------------
llvm/test/CodeGen/X86/half.ll | 12 ++--
6 files changed, 39 insertions(+), 84 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 73791102fc04de..5b0a87d5f150f8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -849,7 +849,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
AddPromotedToType(Op, MVT::bf16, MVT::f32);
}
for (const auto &Op : {ISD::FABS}) {
- setOperationAction(Op, MVT::f16, Promote);
+ // Expand instead of Promote to clear sign bit by bitcasting to i16
+ setOperationAction(Op, MVT::f16, Expand);
setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);
setOperationAction(Op, MVT::v2f16, Expand);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1a6be4eb5af1ef..403ea93b62f6b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -601,7 +601,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
setOperationAction(ISD::FABS, VT, Action);
setOperationAction(ISD::FNEG, VT, Action);
- setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Action);
setOperationAction(ISD::FREM, VT, Action);
setOperationAction(ISD::FMA, VT, Action);
setOperationAction(ISD::FMINNUM, VT, Action);
@@ -672,6 +672,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Half type will be promoted by default.
setF16Action(MVT::f16, Promote);
+ // Expand instead of Promote to clear/flip/copy sign bit by bitcasting to
+ // i16.
+ setOperationAction(ISD::FABS, MVT::f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 14e02a49f6e5e4..cfabf1d6639c2a 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -981,14 +981,10 @@ define half @test_fma(half %a, half %b, half %c) #0 {
}
; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
-; CHECK-NOFTZ: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ: abs.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ: abs.ftz.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
+; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
+; CHECK: and.b16 [[RF:%rs[0-9]+]], [[A]], 32767;
+; CHECK: st.param.b16 [func_retval0+0], [[RF]];
+; CHECK: ret;
define half @test_fabs(half %a) #0 {
%r = call half @llvm.fabs.f16(half %a)
ret half %r
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 464b3a754804fe..abbf9c2a139216 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1182,18 +1182,15 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
ret <2 x half> %r
}
+; TODO: This should be optimised to use only one AND on the i32 register.
; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
-; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
+; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
+; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK: and.b16 [[A2:%rs[0-9]+]], [[A1]], 32767;
+; CHECK: and.b16 [[A3:%rs[0-9]+]], [[A0]], 32767;
+; CHECK: mov.b32 [[B:%r[0-9]+]], {[[A3]], [[A2]]};
+; CHECK: st.param.b32 [func_retval0+0], [[B]];
+; CHECK: ret;
define <2 x half> @test_fabs(<2 x half> %a) #0 {
%r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
ret <2 x half> %r
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index bd1dc47efa41ff..c20e91f46a0a3d 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -124,11 +124,7 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_fabs:
; F16C: # %bb.0:
; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: andl $32767, %eax # imm = 0x7FFF
; F16C-NEXT: movw %ax, (%rdi)
; F16C-NEXT: retq
;
@@ -141,34 +137,17 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
;
; X64-LABEL: test_half_fabs:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: movw %ax, (%rbx)
-; X64-NEXT: popq %rbx
+; X64-NEXT: andl $32767, %eax # imm = 0x7FFF
+; X64-NEXT: movw %ax, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: test_half_fabs:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esi)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: retl
%res = call half @llvm.fabs.half(half %a0)
store half %res, ptr %p0, align 2
@@ -179,11 +158,7 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_fneg:
; F16C: # %bb.0:
; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: xorl $32768, %eax # imm = 0x8000
; F16C-NEXT: movw %ax, (%rdi)
; F16C-NEXT: retq
;
@@ -196,34 +171,17 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
;
; X64-LABEL: test_half_fneg:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: movw %ax, (%rbx)
-; X64-NEXT: popq %rbx
+; X64-NEXT: xorl $32768, %eax # imm = 0x8000
+; X64-NEXT: movw %ax, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: test_half_fneg:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esi)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $32768, %ecx # imm = 0x8000
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: retl
%res = fneg half %a0
store half %res, ptr %p0, align 2
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 9f01d07e6a6705..6838925240058e 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1059,7 +1059,6 @@ define void @main.158() #0 {
; CHECK-LIBCALL: # %bb.0: # %entry
; CHECK-LIBCALL-NEXT: pushq %rax
; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0
-; CHECK-LIBCALL-NEXT: callq __truncsfhf2 at PLT
; CHECK-LIBCALL-NEXT: callq __extendhfsf2 at PLT
; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-LIBCALL-NEXT: ucomiss %xmm0, %xmm1
@@ -1077,10 +1076,10 @@ define void @main.158() #0 {
; BWON-F16C-LABEL: main.158:
; BWON-F16C: # %bb.0: # %entry
; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; BWON-F16C-NEXT: vucomiss %xmm1, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; BWON-F16C-NEXT: vucomiss %xmm0, %xmm1
+; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
; BWON-F16C-NEXT: jae .LBB20_2
; BWON-F16C-NEXT: # %bb.1: # %entry
; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
@@ -1093,8 +1092,7 @@ define void @main.158() #0 {
; CHECK-I686-LABEL: main.158:
; CHECK-I686: # %bb.0: # %entry
; CHECK-I686-NEXT: subl $12, %esp
-; CHECK-I686-NEXT: movl $0, (%esp)
-; CHECK-I686-NEXT: calll __truncsfhf2
+; CHECK-I686-NEXT: pxor %xmm0, %xmm0
; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
; CHECK-I686-NEXT: movw %ax, (%esp)
; CHECK-I686-NEXT: calll __extendhfsf2