[llvm] [NVPTX] Add support for f16 fabs (PR #116107)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 13 14:28:07 PST 2024
================
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; ## Some FP16 support but not for abs
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_53 \
+; RUN: %}
+
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_53 \
+; RUN: %}
+
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
+; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
+; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_52 \
+; RUN: %}
+
+; ## Full FP16 support.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -mattr=+ptx70 \
+; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-F16-ABS %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -mattr=+ptx70 \
+; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_53 \
+; RUN: %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare half @llvm.fabs.f16(half %a)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+
+define half @test_fabs(half %a) {
+; CHECK-NOF16-LABEL: test_fabs(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<3>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [test_fabs_param_0];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs1;
+; CHECK-NOF16-NEXT: abs.f32 %f2, %f1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-ABS-LABEL: test_fabs(
+; CHECK-F16-ABS: {
+; CHECK-F16-ABS-NEXT: .reg .b16 %rs<3>;
+; CHECK-F16-ABS-EMPTY:
+; CHECK-F16-ABS-NEXT: // %bb.0:
+; CHECK-F16-ABS-NEXT: ld.param.b16 %rs1, [test_fabs_param_0];
+; CHECK-F16-ABS-NEXT: abs.f16 %rs2, %rs1;
+; CHECK-F16-ABS-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-F16-ABS-NEXT: ret;
+ %r = call half @llvm.fabs.f16(half %a)
+ ret half %r
+}
+
+define <2 x half> @test_fabs_2(<2 x half> %a) #0 {
+; CHECK-F16-LABEL: test_fabs_2(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<5>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fabs_2_param_0];
+; CHECK-F16-NEXT: and.b32 %r3, %r1, 2147450879;
----------------
AlexMaclean wrote:
AlexMaclean wrote:
I think it is fine from the perspective of LLVM IR's semantics to implement f16 abs with an `and`; it's probably more conformant because it will preserve NaN payloads, while the conversions may not. That being said, I'm not sure about the perf implications of going this route. Maybe in some cases the abs could be strung together with other promoted operations and result in better codegen?
https://github.com/llvm/llvm-project/pull/116107
More information about the llvm-commits
mailing list