[llvm] [X86] Add missing vNbf16 handling in X86CallingConv.td file (PR #127102)

Tue Feb 18 01:44:33 PST 2025

================
@@ -0,0 +1,1162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=SSE2 %s
+; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=FAST_ISEL_SSE2 %s
+; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=AVX512BF16 %s
+; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=FAST_ISEL_AVX512BF16 %s
+; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=AVXNECONVERT %s
+; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=FAST_ISEL_AVXNECONVERT %s
+
+define bfloat @return_arg_bf16(bfloat %x, bfloat %y) #0 {
+; SSE2-LABEL: return_arg_bf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; FAST_ISEL_SSE2-LABEL: return_arg_bf16:
+; FAST_ISEL_SSE2:       # %bb.0:
+; FAST_ISEL_SSE2-NEXT:    pushq %rax
+; FAST_ISEL_SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT:    shll $16, %eax
+; FAST_ISEL_SSE2-NEXT:    movd %eax, %xmm0
+; FAST_ISEL_SSE2-NEXT:    callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT:    popq %rax
+; FAST_ISEL_SSE2-NEXT:    retq
+;
+; AVX512BF16-LABEL: return_arg_bf16:
+; AVX512BF16:       # %bb.0:
+; AVX512BF16-NEXT:    retq
+;
+; FAST_ISEL_AVX512BF16-LABEL: return_arg_bf16:
+; FAST_ISEL_AVX512BF16:       # %bb.0:
+; FAST_ISEL_AVX512BF16-NEXT:    vpextrw $0, %xmm0, %eax
+; FAST_ISEL_AVX512BF16-NEXT:    shll $16, %eax
+; FAST_ISEL_AVX512BF16-NEXT:    vmovd %eax, %xmm0
+; FAST_ISEL_AVX512BF16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
+; FAST_ISEL_AVX512BF16-NEXT:    retq
+;
+; AVXNECONVERT-LABEL: return_arg_bf16:
+; AVXNECONVERT:       # %bb.0:
+; AVXNECONVERT-NEXT:    retq
+;
+; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_bf16:
+; FAST_ISEL_AVXNECONVERT:       # %bb.0:
+; FAST_ISEL_AVXNECONVERT-NEXT:    vpextrw $0, %xmm0, %eax
+; FAST_ISEL_AVXNECONVERT-NEXT:    shll $16, %eax
+; FAST_ISEL_AVXNECONVERT-NEXT:    vmovd %eax, %xmm0
+; FAST_ISEL_AVXNECONVERT-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
+; FAST_ISEL_AVXNECONVERT-NEXT:    retq
+  ret bfloat %x
----------------
mikolaj-pirog wrote:

Using `ret bfloat %y` instead yields essentially the same codegen (save for using xmm1 instead of xmm0), so I have opted to use a single arg in each function.

https://github.com/llvm/llvm-project/pull/127102