[llvm] [X86][NVPTX][LegalizeDAG] If i16 legal, legalize FABS/FNEG/FCOPYSIGN (f16) with Expand (PR #106153)

Tue Aug 27 16:53:43 PDT 2024

================
@@ -1183,17 +1183,13 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
 }
 
 ; CHECK-LABEL: test_fabs(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fabs_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  abs.f32         [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  abs.f32         [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
+; CHECK:    ld.param.b32    [[A:%r[0-9]+]], [test_fabs_param_0];
+; CHECK:    mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK:    and.b16 [[A2:%rs[0-9]+]], [[A1]], 32767;
+; CHECK:    and.b16 [[A3:%rs[0-9]+]], [[A0]], 32767;
+; CHECK:    mov.b32 [[B:%r[0-9]+]], {[[A3]], [[A2]]};
----------------
v01dXYZ wrote:

LegalizeType would type-expand `FABS` (`<2 x f16>`).

The DAG just before instruction selection:

```
Optimized legalized selection DAG: %bb.0 'test_fabs:'
SelectionDAG has 21 nodes:
  t0: ch,glue = EntryToken
  t14: v2f16,ch = load<(dereferenceable invariant load (s32) from `ptr addrspace(101) null`, addrspace 101)> t0, TargetExternalSymbol:i64'test_fabs_param_0', undef:i64
      t8: ch = CopyToReg t0, Register:v2f16 %0, t14
              t16: f16 = extract_vector_elt t14, Constant:i64<0>
            t22: i16 = bitcast t16
          t24: i16 = and t22, Constant:i16<32767>
        t25: f16 = bitcast t24
              t19: f16 = extract_vector_elt t14, Constant:i64<1>
            t26: i16 = bitcast t19
          t27: i16 = and t26, Constant:i16<32767>
        t28: f16 = bitcast t27
      t21: v2f16 = BUILD_VECTOR t25, t28
    t11: ch = NVPTXISD::StoreRetval<(store (s32), align 1)> t8, Constant:i32<0>, t21
  t12: ch = NVPTXISD::RET_GLUE t11
``` 

Maybe we could detect when the same composition of bitwise operations is applied to each element and merge it into a single composition applied to the bitcasted vector itself (starting from the `BUILD_VECTOR`).

BTW that's also the case with `x86`:

``` llvm
define <2 x half> @test_fabs(<2 x half> %a) #0 {
  %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
  ret <2 x half> %r
}
``` 

``` asm
        .text
        .file   "fabs_nvptx.ll"
        .globl  test_fabs                       # -- Begin function test_fabs
        .p2align        4, 0x90
        .type   test_fabs,@function
test_fabs:                              # @test_fabs
        .cfi_startproc
# %bb.0:
        pextrw  $0, %xmm0, %eax
        psrld   $16, %xmm0
        pextrw  $0, %xmm0, %ecx
        andl    $32767, %ecx                    # imm = 0x7FFF
        pinsrw  $0, %ecx, %xmm1
        andl    $32767, %eax                    # imm = 0x7FFF
        pinsrw  $0, %eax, %xmm0
        punpcklwd       %xmm1, %xmm0            # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
        retq
.Lfunc_end0:
        .size   test_fabs, .Lfunc_end0-test_fabs
        .cfi_endproc
                                        # -- End function
        .section        ".note.GNU-stack","",@progbits
```

But `AArch64` does not seem to do so (there is a combine that turns the `build_vector` into a `concat_vector ..., undef`).


https://github.com/llvm/llvm-project/pull/106153