[llvm] 9177e81 - X86: Fix asserting on bfloat argument/return without sse2 (#93146)
via llvm-commits
llvm-commits@lists.llvm.org
Mon Sep 30 07:48:03 PDT 2024
Author: Matt Arsenault
Date: 2024-09-30T18:48:00+04:00
New Revision: 9177e812677ee915c6606498db57808c13ee892a
URL: https://github.com/llvm/llvm-project/commit/9177e812677ee915c6606498db57808c13ee892a
DIFF: https://github.com/llvm/llvm-project/commit/9177e812677ee915c6606498db57808c13ee892a.diff
LOG: X86: Fix asserting on bfloat argument/return without sse2 (#93146)
These now get the default promote-to-float behavior, like half does.
Fixes #92899
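A minimal reproducer, distilled from the new test added below (attributes and
autogenerated check lines omitted here), is just a function that takes and
returns a bfloat when f16 is not a legal type:

  ; llc -mtriple=i386-linux-gnu < %s   (i386 without SSE2)
  define bfloat @return_arg_bf16(bfloat %x) {
    ret bfloat %x
  }

Previously this tripped an assertion while lowering the bfloat argument and
return value; now the value is promoted to float for the calling convention,
with a __truncsfbf2 libcall emitted when a bfloat result has to be written
back to memory, as the checks in the new test show.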
Added:
llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
Modified:
llvm/lib/Target/X86/X86ISelLoweringCall.cpp
llvm/test/CodeGen/X86/atomic-non-integer.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index b9124658028da4..8561658379f7e4 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -124,12 +124,14 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
       !Subtarget.hasX87())
     return MVT::i32;
 
-  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
-    return getRegisterTypeForCallingConv(Context, CC,
-                                         VT.changeVectorElementType(MVT::f16));
+  if (isTypeLegal(MVT::f16)) {
+    if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
+      return getRegisterTypeForCallingConv(
+          Context, CC, VT.changeVectorElementType(MVT::f16));
 
-  if (VT == MVT::bf16)
-    return MVT::f16;
+    if (VT == MVT::bf16)
+      return MVT::f16;
+  }
 
   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
 }
@@ -162,7 +164,8 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
     return 3;
   }
 
-  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
+  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
+      isTypeLegal(MVT::f16))
     return getNumRegistersForCallingConv(Context, CC,
                                          VT.changeVectorElementType(MVT::f16));
 
@@ -194,7 +197,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
   }
 
   // Split vNbf16 vectors according to vNf16.
-  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
+  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
+      isTypeLegal(MVT::f16))
     VT = VT.changeVectorElementType(MVT::f16);
 
   return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index d7633cb11e44c1..bd794712a31096 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -789,12 +789,55 @@ define double @load_double_seq_cst(ptr %fptr) {
}
define void @store_bfloat(ptr %fptr, bfloat %v) {
-; X86-LABEL: store_bfloat:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: retl
+; X86-SSE1-LABEL: store_bfloat:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %esi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 16
+; X86-SSE1-NEXT: .cfi_offset %esi, -8
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: movss %xmm0, (%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE1-NEXT: calll __truncsfbf2
+; X86-SSE1-NEXT: movw %ax, (%esi)
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT: popl %esi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: store_bfloat:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movw %cx, (%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: store_bfloat:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movw %cx, (%eax)
+; X86-AVX-NEXT: retl
+;
+; X86-NOSSE-LABEL: store_bfloat:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %esi
+; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: .cfi_def_cfa_offset 16
+; X86-NOSSE-NEXT: .cfi_offset %esi, -8
+; X86-NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fstps (%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT: calll __truncsfbf2
+; X86-NOSSE-NEXT: movw %ax, (%esi)
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT: popl %esi
+; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4
+; X86-NOSSE-NEXT: retl
;
; X64-SSE-LABEL: store_bfloat:
; X64-SSE: # %bb.0:
@@ -811,8 +854,7 @@ define void @store_bfloat(ptr %fptr, bfloat %v) {
ret void
}
-; Work around issue #92899 by casting to float
-define float @load_bfloat(ptr %fptr) {
+define bfloat @load_bfloat(ptr %fptr) {
; X86-SSE1-LABEL: load_bfloat:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl %eax
@@ -828,30 +870,16 @@ define float @load_bfloat(ptr %fptr) {
;
; X86-SSE2-LABEL: load_bfloat:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %eax
-; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzwl (%eax), %eax
-; X86-SSE2-NEXT: shll $16, %eax
-; X86-SSE2-NEXT: movd %eax, %xmm0
-; X86-SSE2-NEXT: movd %xmm0, (%esp)
-; X86-SSE2-NEXT: flds (%esp)
-; X86-SSE2-NEXT: popl %eax
-; X86-SSE2-NEXT: .cfi_def_cfa_offset 4
+; X86-SSE2-NEXT: pinsrw $0, %eax, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-AVX-LABEL: load_bfloat:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: pushl %eax
-; X86-AVX-NEXT: .cfi_def_cfa_offset 8
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movzwl (%eax), %eax
-; X86-AVX-NEXT: shll $16, %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm0
-; X86-AVX-NEXT: vmovd %xmm0, (%esp)
-; X86-AVX-NEXT: flds (%esp)
-; X86-AVX-NEXT: popl %eax
-; X86-AVX-NEXT: .cfi_def_cfa_offset 4
+; X86-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X86-NOSSE-LABEL: load_bfloat:
@@ -870,17 +898,14 @@ define float @load_bfloat(ptr %fptr) {
; X64-SSE-LABEL: load_bfloat:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movzwl (%rdi), %eax
-; X64-SSE-NEXT: shll $16, %eax
-; X64-SSE-NEXT: movd %eax, %xmm0
+; X64-SSE-NEXT: pinsrw $0, %eax, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: load_bfloat:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movzwl (%rdi), %eax
-; X64-AVX-NEXT: shll $16, %eax
-; X64-AVX-NEXT: vmovd %eax, %xmm0
+; X64-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; X64-AVX-NEXT: retq
%v = load atomic bfloat, ptr %fptr unordered, align 2
- %ext = fpext bfloat %v to float
- ret float %ext
+ ret bfloat %v
}
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
new file mode 100644
index 00000000000000..f363cad816dfb2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
@@ -0,0 +1,1243 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386-linux-gnu < %s | FileCheck -check-prefixes=CHECK,NOSSE %s
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse < %s | FileCheck -check-prefixes=CHECK,SSE %s
+
+; Make sure no assert without SSE2 and bfloat. Issue 92899
+
+define bfloat @return_arg_bf16(bfloat %x) #0 {
+; CHECK-LABEL: return_arg_bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: retl
+ ret bfloat %x
+}
+
+define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) #0 {
+; CHECK-LABEL: return_arg_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: retl
+ ret <2 x bfloat> %x
+}
+
+define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 {
+; NOSSE-LABEL: return_arg_v3bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: pushl %eax
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %esi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: # kill: def $ax killed $ax def $eax
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movzwl %si, %edi
+; NOSSE-NEXT: orl %eax, %edi
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, 4(%esi)
+; NOSSE-NEXT: movl %edi, (%esi)
+; NOSSE-NEXT: movl %esi, %eax
+; NOSSE-NEXT: addl $4, %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: retl $4
+;
+; SSE-LABEL: return_arg_v3bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: pushl %eax
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %esi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: # kill: def $ax killed $ax def $eax
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movzwl %si, %edi
+; SSE-NEXT: orl %eax, %edi
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, 4(%esi)
+; SSE-NEXT: movl %edi, (%esi)
+; SSE-NEXT: movl %esi, %eax
+; SSE-NEXT: addl $4, %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: popl %edi
+; SSE-NEXT: retl $4
+ ret <3 x bfloat> %x
+}
+
+define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 {
+; NOSSE-LABEL: return_arg_v4bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %ebp
+; NOSSE-NEXT: pushl %ebx
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: subl $12, %esp
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %esi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %edi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %ebx
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, 6(%ebp)
+; NOSSE-NEXT: movw %bx, 4(%ebp)
+; NOSSE-NEXT: movw %di, 2(%ebp)
+; NOSSE-NEXT: movw %si, (%ebp)
+; NOSSE-NEXT: movl %ebp, %eax
+; NOSSE-NEXT: addl $12, %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: popl %ebx
+; NOSSE-NEXT: popl %ebp
+; NOSSE-NEXT: retl $4
+;
+; SSE-LABEL: return_arg_v4bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: pushl %ebx
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: subl $12, %esp
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %esi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %edi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %ebx
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, 6(%ebp)
+; SSE-NEXT: movw %bx, 4(%ebp)
+; SSE-NEXT: movw %di, 2(%ebp)
+; SSE-NEXT: movw %si, (%ebp)
+; SSE-NEXT: movl %ebp, %eax
+; SSE-NEXT: addl $12, %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: popl %edi
+; SSE-NEXT: popl %ebx
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: retl $4
+ ret <4 x bfloat> %x
+}
+
+define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 {
+; NOSSE-LABEL: return_arg_v8bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %ebp
+; NOSSE-NEXT: pushl %ebx
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: subl $12, %esp
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %esi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %edi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %ebx
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, 14(%ebp)
+; NOSSE-NEXT: movw %bx, 12(%ebp)
+; NOSSE-NEXT: movw %di, 10(%ebp)
+; NOSSE-NEXT: movw %si, 8(%ebp)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 6(%ebp)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 4(%ebp)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 2(%ebp)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, (%ebp)
+; NOSSE-NEXT: movl %ebp, %eax
+; NOSSE-NEXT: addl $12, %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: popl %ebx
+; NOSSE-NEXT: popl %ebp
+; NOSSE-NEXT: retl $4
+;
+; SSE-LABEL: return_arg_v8bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: pushl %ebx
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: subl $12, %esp
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %esi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %edi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %ebx
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, 14(%ebp)
+; SSE-NEXT: movw %bx, 12(%ebp)
+; SSE-NEXT: movw %di, 10(%ebp)
+; SSE-NEXT: movw %si, 8(%ebp)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 6(%ebp)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 4(%ebp)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 2(%ebp)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, (%ebp)
+; SSE-NEXT: movl %ebp, %eax
+; SSE-NEXT: addl $12, %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: popl %edi
+; SSE-NEXT: popl %ebx
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: retl $4
+ ret <8 x bfloat> %x
+}
+
+define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 {
+; NOSSE-LABEL: return_arg_v16bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %ebp
+; NOSSE-NEXT: pushl %ebx
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: subl $28, %esp
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %esi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %ebx
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %ebp
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, 30(%edi)
+; NOSSE-NEXT: movw %bp, 28(%edi)
+; NOSSE-NEXT: movw %bx, 26(%edi)
+; NOSSE-NEXT: movw %si, 24(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 22(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 20(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 18(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 16(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 14(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 12(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 10(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 8(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 6(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 4(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 2(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, (%edi)
+; NOSSE-NEXT: movl %edi, %eax
+; NOSSE-NEXT: addl $28, %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: popl %ebx
+; NOSSE-NEXT: popl %ebp
+; NOSSE-NEXT: retl $4
+;
+; SSE-LABEL: return_arg_v16bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: pushl %ebx
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: subl $28, %esp
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %esi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %ebx
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %ebp
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, 30(%edi)
+; SSE-NEXT: movw %bp, 28(%edi)
+; SSE-NEXT: movw %bx, 26(%edi)
+; SSE-NEXT: movw %si, 24(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 22(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 20(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 18(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 16(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 14(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 12(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 10(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 8(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 6(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 4(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 2(%edi)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, (%edi)
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: addl $28, %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: popl %edi
+; SSE-NEXT: popl %ebx
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: retl $4
+ ret <16 x bfloat> %x
+}
+
+declare bfloat @returns_bf16(bfloat)
+declare <2 x bfloat> @returns_v2bf16(<2 x bfloat>)
+declare <3 x bfloat> @returns_v3bf16(<3 x bfloat>)
+declare <4 x bfloat> @returns_v4bf16(<4 x bfloat>)
+declare <8 x bfloat> @returns_v8bf16(<8 x bfloat>)
+declare <16 x bfloat> @returns_v16bf16(<16 x bfloat>)
+
+define void @call_ret_bf16(ptr %ptr) #0 {
+; NOSSE-LABEL: call_ret_bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: subl $8, %esp
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT: movzwl (%esi), %eax
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll returns_bf16@PLT
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, (%esi)
+; NOSSE-NEXT: addl $8, %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: retl
+;
+; SSE-LABEL: call_ret_bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: subl $8, %esp
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT: movzwl (%esi), %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll returns_bf16@PLT
+; SSE-NEXT: fstps (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, (%esi)
+; SSE-NEXT: addl $8, %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: retl
+ %val = load bfloat, ptr %ptr
+ %bf16 = call bfloat @returns_bf16(bfloat %val)
+ store bfloat %bf16, ptr %ptr
+ ret void
+}
+
+define void @call_ret_v2bf16(ptr %ptr) #0 {
+; NOSSE-LABEL: call_ret_v2bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: subl $20, %esp
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT: movzwl 2(%edi), %eax
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl (%edi), %eax
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll returns_v2bf16@PLT
+; NOSSE-NEXT: fxch %st(1)
+; NOSSE-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %esi
+; NOSSE-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, 2(%edi)
+; NOSSE-NEXT: movw %si, (%edi)
+; NOSSE-NEXT: addl $20, %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: retl
+;
+; SSE-LABEL: call_ret_v2bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: subl $36, %esp
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-NEXT: movzwl 2(%edi), %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl (%edi), %eax
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll returns_v2bf16@PLT
+; SSE-NEXT: fxch %st(1)
+; SSE-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; SSE-NEXT: fstps (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %esi
+; SSE-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; SSE-NEXT: fstps (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, 2(%edi)
+; SSE-NEXT: movw %si, (%edi)
+; SSE-NEXT: addl $36, %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: popl %edi
+; SSE-NEXT: retl
+ %val = load <2 x bfloat>, ptr %ptr
+ %bf16 = call <2 x bfloat> @returns_v2bf16(<2 x bfloat> %val)
+ store <2 x bfloat> %bf16, ptr %ptr
+ ret void
+}
+
+define void @call_ret_v3bf16(ptr %ptr) #0 {
+; NOSSE-LABEL: call_ret_v3bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: subl $40, %esp
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT: movl (%esi), %eax
+; NOSSE-NEXT: movl 4(%esi), %ecx
+; NOSSE-NEXT: leal {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT: movl %edx, (%esp)
+; NOSSE-NEXT: shll $16, %ecx
+; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl %eax, %ecx
+; NOSSE-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000
+; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: calll returns_v3bf16@PLT
+; NOSSE-NEXT: subl $4, %esp
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT: movw %cx, 4(%esi)
+; NOSSE-NEXT: movl %eax, (%esi)
+; NOSSE-NEXT: addl $40, %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: retl
+;
+; SSE-LABEL: call_ret_v3bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: subl $40, %esp
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT: movl (%esi), %eax
+; SSE-NEXT: movl 4(%esi), %ecx
+; SSE-NEXT: leal {{[0-9]+}}(%esp), %edx
+; SSE-NEXT: movl %edx, (%esp)
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000
+; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: calll returns_v3bf16@PLT
+; SSE-NEXT: subl $4, %esp
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT: movw %cx, 4(%esi)
+; SSE-NEXT: movl %eax, (%esi)
+; SSE-NEXT: addl $40, %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: retl
+ %val = load <3 x bfloat>, ptr %ptr
+ %bf16 = call <3 x bfloat> @returns_v3bf16(<3 x bfloat> %val)
+ store <3 x bfloat> %bf16, ptr %ptr
+ ret void
+}
+
+define void @call_ret_v4bf16(ptr %ptr) #0 {
+; NOSSE-LABEL: call_ret_v4bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %ebx
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: subl $48, %esp
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT: movzwl 2(%esi), %ecx
+; NOSSE-NEXT: movl (%esi), %eax
+; NOSSE-NEXT: movl 4(%esi), %edx
+; NOSSE-NEXT: movzwl 6(%esi), %edi
+; NOSSE-NEXT: leal {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT: movl %ebx, (%esp)
+; NOSSE-NEXT: shll $16, %edi
+; NOSSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %edx
+; NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %ecx
+; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: calll returns_v4bf16@PLT
+; NOSSE-NEXT: subl $4, %esp
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT: movw %dx, 6(%esi)
+; NOSSE-NEXT: movw %cx, 4(%esi)
+; NOSSE-NEXT: movl %eax, (%esi)
+; NOSSE-NEXT: addl $48, %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: popl %ebx
+; NOSSE-NEXT: retl
+;
+; SSE-LABEL: call_ret_v4bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %ebx
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: subl $48, %esp
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT: movzwl 2(%esi), %ecx
+; SSE-NEXT: movl (%esi), %eax
+; SSE-NEXT: movl 4(%esi), %edx
+; SSE-NEXT: movzwl 6(%esi), %edi
+; SSE-NEXT: leal {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT: movl %ebx, (%esp)
+; SSE-NEXT: shll $16, %edi
+; SSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %edx
+; SSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: calll returns_v4bf16@PLT
+; SSE-NEXT: subl $4, %esp
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; SSE-NEXT: movw %dx, 6(%esi)
+; SSE-NEXT: movw %cx, 4(%esi)
+; SSE-NEXT: movl %eax, (%esi)
+; SSE-NEXT: addl $48, %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: popl %edi
+; SSE-NEXT: popl %ebx
+; SSE-NEXT: retl
+ %val = load <4 x bfloat>, ptr %ptr
+ %bf16 = call <4 x bfloat> @returns_v4bf16(<4 x bfloat> %val)
+ store <4 x bfloat> %bf16, ptr %ptr
+ ret void
+}
+
+define void @call_ret_v8bf16(ptr %ptr) #0 {
+; NOSSE-LABEL: call_ret_v8bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %ebp
+; NOSSE-NEXT: pushl %ebx
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: subl $108, %esp
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT: movzwl 2(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl (%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl 4(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movzwl 6(%esi), %edi
+; NOSSE-NEXT: movl 8(%esi), %ebx
+; NOSSE-NEXT: movzwl 10(%esi), %ebp
+; NOSSE-NEXT: movl 12(%esi), %ecx
+; NOSSE-NEXT: movzwl 14(%esi), %eax
+; NOSSE-NEXT: leal {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT: movl %edx, (%esp)
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %ecx
+; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %ebp
+; NOSSE-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %ebx
+; NOSSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %edi
+; NOSSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: calll returns_v8bf16@PLT
+; NOSSE-NEXT: subl $4, %esp
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %ebp
+; NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT: movw %ax, 14(%esi)
+; NOSSE-NEXT: movw %bp, 12(%esi)
+; NOSSE-NEXT: movw %bx, 10(%esi)
+; NOSSE-NEXT: movw %di, 8(%esi)
+; NOSSE-NEXT: movw %dx, 6(%esi)
+; NOSSE-NEXT: movw %cx, 4(%esi)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: movl %eax, (%esi)
+; NOSSE-NEXT: addl $108, %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: popl %ebx
+; NOSSE-NEXT: popl %ebp
+; NOSSE-NEXT: retl
+;
+; SSE-LABEL: call_ret_v8bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: pushl %ebx
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: subl $108, %esp
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT: movzwl 2(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl (%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl 4(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movzwl 6(%esi), %edi
+; SSE-NEXT: movl 8(%esi), %ebx
+; SSE-NEXT: movzwl 10(%esi), %ebp
+; SSE-NEXT: movl 12(%esi), %ecx
+; SSE-NEXT: movzwl 14(%esi), %eax
+; SSE-NEXT: leal {{[0-9]+}}(%esp), %edx
+; SSE-NEXT: movl %edx, (%esp)
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %ebp
+; SSE-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %ebx
+; SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %edi
+; SSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: calll returns_v8bf16@PLT
+; SSE-NEXT: subl $4, %esp
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %edi
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ebp
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movw %ax, 14(%esi)
+; SSE-NEXT: movw %bp, 12(%esi)
+; SSE-NEXT: movw %bx, 10(%esi)
+; SSE-NEXT: movw %di, 8(%esi)
+; SSE-NEXT: movw %dx, 6(%esi)
+; SSE-NEXT: movw %cx, 4(%esi)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: movl %eax, (%esi)
+; SSE-NEXT: addl $108, %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: popl %edi
+; SSE-NEXT: popl %ebx
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: retl
+ %val = load <8 x bfloat>, ptr %ptr
+ %bf16 = call <8 x bfloat> @returns_v8bf16(<8 x bfloat> %val)
+ store <8 x bfloat> %bf16, ptr %ptr
+ ret void
+}
+
+define void @call_ret_v16bf16(ptr %ptr) #0 {
+; NOSSE-LABEL: call_ret_v16bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %ebp
+; NOSSE-NEXT: movl %esp, %ebp
+; NOSSE-NEXT: pushl %ebx
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: andl $-32, %esp
+; NOSSE-NEXT: subl $256, %esp # imm = 0x100
+; NOSSE-NEXT: movl 8(%ebp), %esi
+; NOSSE-NEXT: movzwl 2(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl (%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl 4(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movzwl 6(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl 8(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movzwl 10(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl 12(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movzwl 14(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl 16(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movzwl 18(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl 20(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movzwl 22(%esi), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl 24(%esi), %edi
+; NOSSE-NEXT: movzwl 26(%esi), %edx
+; NOSSE-NEXT: movl 28(%esi), %ecx
+; NOSSE-NEXT: movzwl 30(%esi), %eax
+; NOSSE-NEXT: leal {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT: movl %ebx, (%esp)
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %ecx
+; NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %edx
+; NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: shll $16, %edi
+; NOSSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT: calll returns_v16bf16@PLT
+; NOSSE-NEXT: subl $4, %esp
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT: movl %edx, 28(%esi)
+; NOSSE-NEXT: movl %eax, 24(%esi)
+; NOSSE-NEXT: movl %ecx, 20(%esi)
+; NOSSE-NEXT: movl %ebx, 16(%esi)
+; NOSSE-NEXT: movl %edi, 12(%esi)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: movl %eax, 8(%esi)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: movl %eax, 4(%esi)
+; NOSSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT: movl %eax, (%esi)
+; NOSSE-NEXT: leal -12(%ebp), %esp
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: popl %ebx
+; NOSSE-NEXT: popl %ebp
+; NOSSE-NEXT: retl
+;
+; SSE-LABEL: call_ret_v16bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: movl %esp, %ebp
+; SSE-NEXT: pushl %ebx
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: andl $-32, %esp
+; SSE-NEXT: subl $256, %esp # imm = 0x100
+; SSE-NEXT: movl 8(%ebp), %esi
+; SSE-NEXT: movzwl 2(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl (%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl 4(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movzwl 6(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl 8(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movzwl 10(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl 12(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movzwl 14(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl 16(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movzwl 18(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl 20(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movzwl 22(%esi), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl 24(%esi), %edi
+; SSE-NEXT: movzwl 26(%esi), %edx
+; SSE-NEXT: movl 28(%esi), %ecx
+; SSE-NEXT: movzwl 30(%esi), %eax
+; SSE-NEXT: leal {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT: movl %ebx, (%esp)
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %edx
+; SSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: shll $16, %edi
+; SSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: calll returns_v16bf16@PLT
+; SSE-NEXT: subl $4, %esp
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SSE-NEXT: movl %edx, 28(%esi)
+; SSE-NEXT: movl %eax, 24(%esi)
+; SSE-NEXT: movl %ecx, 20(%esi)
+; SSE-NEXT: movl %ebx, 16(%esi)
+; SSE-NEXT: movl %edi, 12(%esi)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: movl %eax, 8(%esi)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: movl %eax, 4(%esi)
+; SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT: movl %eax, (%esi)
+; SSE-NEXT: leal -12(%ebp), %esp
+; SSE-NEXT: popl %esi
+; SSE-NEXT: popl %edi
+; SSE-NEXT: popl %ebx
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: retl
+ %val = load <16 x bfloat>, ptr %ptr
+ %bf16 = call <16 x bfloat> @returns_v16bf16(<16 x bfloat> %val)
+ store <16 x bfloat> %bf16, ptr %ptr
+ ret void
+}
+
+attributes #0 = { nounwind }