[llvm] X86: Fix asserting on bfloat argument/return without sse2 (PR #93146)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed May 29 04:30:38 PDT 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/93146

>From 9b8131d23bb6c83a35f6a3ea361c512adeddf5b1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 23 May 2024 07:56:23 +0200
Subject: [PATCH 1/4] X86: Fix asserting on bfloat argument/return without sse2

These now get the default promote-to-float behavior, like half does.

Fixes #92899
---
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp   |   18 +-
 .../X86/bfloat-calling-conv-no-sse2.ll        | 1441 +++++++++++++++++
 2 files changed, 1452 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll

diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index b107d56f8cf9b..4fddb60db5f10 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -123,12 +123,14 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
       !Subtarget.hasX87())
     return MVT::i32;
 
-  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
-    return getRegisterTypeForCallingConv(Context, CC,
-                                         VT.changeVectorElementType(MVT::f16));
+  if (isTypeLegal(MVT::f16)) {
+    if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
+      return getRegisterTypeForCallingConv(
+          Context, CC, VT.changeVectorElementType(MVT::f16));
 
-  if (VT == MVT::bf16)
-    return MVT::f16;
+    if (VT == MVT::bf16)
+      return MVT::f16;
+  }
 
   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
 }
@@ -161,7 +163,8 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
       return 3;
   }
 
-  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
+  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
+      isTypeLegal(MVT::f16))
     return getNumRegistersForCallingConv(Context, CC,
                                          VT.changeVectorElementType(MVT::f16));
 
@@ -193,7 +196,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
   }
 
   // Split vNbf16 vectors according to vNf16.
-  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
+  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
+      isTypeLegal(MVT::f16))
     VT = VT.changeVectorElementType(MVT::f16);
 
   return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
new file mode 100644
index 0000000000000..05f036201f4b1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
@@ -0,0 +1,1441 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386-linux-gnu < %s | FileCheck -check-prefixes=CHECK,NOSSE %s
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse < %s | FileCheck -check-prefixes=CHECK,SSE %s
+
+; Make sure bfloat arguments and returns do not assert without SSE2. Issue 92899
+
+define bfloat @return_arg_bf16(bfloat %x) {
+; CHECK-LABEL: return_arg_bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    retl
+  ret bfloat %x
+}
+
+define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) {
+; CHECK-LABEL: return_arg_v2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    retl
+  ret <2 x bfloat> %x
+}
+
+define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) {
+; NOSSE-LABEL: return_arg_v3bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    pushl %eax
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    .cfi_offset %esi, -12
+; NOSSE-NEXT:    .cfi_offset %edi, -8
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %esi
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    # kill: def $ax killed $ax def $eax
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movzwl %si, %edi
+; NOSSE-NEXT:    orl %eax, %edi
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, 4(%esi)
+; NOSSE-NEXT:    movl %edi, (%esi)
+; NOSSE-NEXT:    movl %esi, %eax
+; NOSSE-NEXT:    addl $4, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    popl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; NOSSE-NEXT:    retl $4
+;
+; SSE-LABEL: return_arg_v3bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    pushl %eax
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    .cfi_offset %esi, -12
+; SSE-NEXT:    .cfi_offset %edi, -8
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %esi
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    # kill: def $ax killed $ax def $eax
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movzwl %si, %edi
+; SSE-NEXT:    orl %eax, %edi
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, 4(%esi)
+; SSE-NEXT:    movl %edi, (%esi)
+; SSE-NEXT:    movl %esi, %eax
+; SSE-NEXT:    addl $4, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    popl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 4
+; SSE-NEXT:    retl $4
+  ret <3 x bfloat> %x
+}
+
+define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) {
+; NOSSE-LABEL: return_arg_v4bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    pushl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    pushl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 20
+; NOSSE-NEXT:    subl $12, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 32
+; NOSSE-NEXT:    .cfi_offset %esi, -20
+; NOSSE-NEXT:    .cfi_offset %edi, -16
+; NOSSE-NEXT:    .cfi_offset %ebx, -12
+; NOSSE-NEXT:    .cfi_offset %ebp, -8
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %esi
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %edi
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %ebx
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, 6(%ebp)
+; NOSSE-NEXT:    movw %bx, 4(%ebp)
+; NOSSE-NEXT:    movw %di, 2(%ebp)
+; NOSSE-NEXT:    movw %si, (%ebp)
+; NOSSE-NEXT:    movl %ebp, %eax
+; NOSSE-NEXT:    addl $12, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 20
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    popl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    popl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    popl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; NOSSE-NEXT:    retl $4
+;
+; SSE-LABEL: return_arg_v4bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %ebp
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    pushl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    pushl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 20
+; SSE-NEXT:    subl $12, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 32
+; SSE-NEXT:    .cfi_offset %esi, -20
+; SSE-NEXT:    .cfi_offset %edi, -16
+; SSE-NEXT:    .cfi_offset %ebx, -12
+; SSE-NEXT:    .cfi_offset %ebp, -8
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %esi
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %edi
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %ebx
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, 6(%ebp)
+; SSE-NEXT:    movw %bx, 4(%ebp)
+; SSE-NEXT:    movw %di, 2(%ebp)
+; SSE-NEXT:    movw %si, (%ebp)
+; SSE-NEXT:    movl %ebp, %eax
+; SSE-NEXT:    addl $12, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 20
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    popl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    popl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    popl %ebp
+; SSE-NEXT:    .cfi_def_cfa_offset 4
+; SSE-NEXT:    retl $4
+  ret <4 x bfloat> %x
+}
+
+define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) {
+; NOSSE-LABEL: return_arg_v8bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    pushl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    pushl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 20
+; NOSSE-NEXT:    subl $12, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 32
+; NOSSE-NEXT:    .cfi_offset %esi, -20
+; NOSSE-NEXT:    .cfi_offset %edi, -16
+; NOSSE-NEXT:    .cfi_offset %ebx, -12
+; NOSSE-NEXT:    .cfi_offset %ebp, -8
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %esi
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %edi
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %ebx
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, 14(%ebp)
+; NOSSE-NEXT:    movw %bx, 12(%ebp)
+; NOSSE-NEXT:    movw %di, 10(%ebp)
+; NOSSE-NEXT:    movw %si, 8(%ebp)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 6(%ebp)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 4(%ebp)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 2(%ebp)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, (%ebp)
+; NOSSE-NEXT:    movl %ebp, %eax
+; NOSSE-NEXT:    addl $12, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 20
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    popl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    popl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    popl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; NOSSE-NEXT:    retl $4
+;
+; SSE-LABEL: return_arg_v8bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %ebp
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    pushl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    pushl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 20
+; SSE-NEXT:    subl $12, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 32
+; SSE-NEXT:    .cfi_offset %esi, -20
+; SSE-NEXT:    .cfi_offset %edi, -16
+; SSE-NEXT:    .cfi_offset %ebx, -12
+; SSE-NEXT:    .cfi_offset %ebp, -8
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %esi
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %edi
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %ebx
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, 14(%ebp)
+; SSE-NEXT:    movw %bx, 12(%ebp)
+; SSE-NEXT:    movw %di, 10(%ebp)
+; SSE-NEXT:    movw %si, 8(%ebp)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 6(%ebp)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 4(%ebp)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 2(%ebp)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, (%ebp)
+; SSE-NEXT:    movl %ebp, %eax
+; SSE-NEXT:    addl $12, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 20
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    popl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    popl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    popl %ebp
+; SSE-NEXT:    .cfi_def_cfa_offset 4
+; SSE-NEXT:    retl $4
+  ret <8 x bfloat> %x
+}
+
+define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) {
+; NOSSE-LABEL: return_arg_v16bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    pushl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    pushl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 20
+; NOSSE-NEXT:    subl $28, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 48
+; NOSSE-NEXT:    .cfi_offset %esi, -20
+; NOSSE-NEXT:    .cfi_offset %edi, -16
+; NOSSE-NEXT:    .cfi_offset %ebx, -12
+; NOSSE-NEXT:    .cfi_offset %ebp, -8
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %esi
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %ebx
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %ebp
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, 30(%edi)
+; NOSSE-NEXT:    movw %bp, 28(%edi)
+; NOSSE-NEXT:    movw %bx, 26(%edi)
+; NOSSE-NEXT:    movw %si, 24(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 22(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 20(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 18(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 16(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 14(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 12(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 10(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 8(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 6(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 4(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, 2(%edi)
+; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    movw %ax, (%edi)
+; NOSSE-NEXT:    movl %edi, %eax
+; NOSSE-NEXT:    addl $28, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 20
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    popl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    popl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    popl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; NOSSE-NEXT:    retl $4
+;
+; SSE-LABEL: return_arg_v16bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %ebp
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    pushl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    pushl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 20
+; SSE-NEXT:    subl $28, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 48
+; SSE-NEXT:    .cfi_offset %esi, -20
+; SSE-NEXT:    .cfi_offset %edi, -16
+; SSE-NEXT:    .cfi_offset %ebx, -12
+; SSE-NEXT:    .cfi_offset %ebp, -8
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %esi
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %ebx
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %ebp
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, 30(%edi)
+; SSE-NEXT:    movw %bp, 28(%edi)
+; SSE-NEXT:    movw %bx, 26(%edi)
+; SSE-NEXT:    movw %si, 24(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 22(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 20(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 18(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 16(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 14(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 12(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 10(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 8(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 6(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 4(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, 2(%edi)
+; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    movw %ax, (%edi)
+; SSE-NEXT:    movl %edi, %eax
+; SSE-NEXT:    addl $28, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 20
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    popl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    popl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    popl %ebp
+; SSE-NEXT:    .cfi_def_cfa_offset 4
+; SSE-NEXT:    retl $4
+  ret <16 x bfloat> %x
+}
+
+declare bfloat @returns_bf16(bfloat)
+declare <2 x bfloat> @returns_v2bf16(<2 x bfloat>)
+declare <3 x bfloat> @returns_v3bf16(<3 x bfloat>)
+declare <4 x bfloat> @returns_v4bf16(<4 x bfloat>)
+declare <8 x bfloat> @returns_v8bf16(<8 x bfloat>)
+declare <16 x bfloat> @returns_v16bf16(<16 x bfloat>)
+
+define void @call_ret_bf16(ptr %ptr) {
+; NOSSE-LABEL: call_ret_bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    subl $8, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    .cfi_offset %esi, -8
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT:    movzwl (%esi), %eax
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll returns_bf16@PLT
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, (%esi)
+; NOSSE-NEXT:    addl $8, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; NOSSE-NEXT:    retl
+;
+; SSE-LABEL: call_ret_bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    subl $8, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    .cfi_offset %esi, -8
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT:    movzwl (%esi), %eax
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll returns_bf16@PLT
+; SSE-NEXT:    fstps (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, (%esi)
+; SSE-NEXT:    addl $8, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 4
+; SSE-NEXT:    retl
+  %val = load bfloat, ptr %ptr
+  %bf16 = call bfloat @returns_bf16(bfloat %val)
+  store bfloat %bf16, ptr %ptr
+  ret void
+}
+
+define void @call_ret_v2bf16(ptr %ptr) {
+; NOSSE-LABEL: call_ret_v2bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    subl $20, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 32
+; NOSSE-NEXT:    .cfi_offset %esi, -12
+; NOSSE-NEXT:    .cfi_offset %edi, -8
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT:    movzwl 2(%edi), %eax
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl (%edi), %eax
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll returns_v2bf16@PLT
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movl %eax, %esi
+; NOSSE-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, 2(%edi)
+; NOSSE-NEXT:    movw %si, (%edi)
+; NOSSE-NEXT:    addl $20, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    popl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; NOSSE-NEXT:    retl
+;
+; SSE-LABEL: call_ret_v2bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    subl $36, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 48
+; SSE-NEXT:    .cfi_offset %esi, -12
+; SSE-NEXT:    .cfi_offset %edi, -8
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; SSE-NEXT:    movzwl 2(%edi), %eax
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl (%edi), %eax
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, (%esp)
+; SSE-NEXT:    calll returns_v2bf16@PLT
+; SSE-NEXT:    fxch %st(1)
+; SSE-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; SSE-NEXT:    fstps (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movl %eax, %esi
+; SSE-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; SSE-NEXT:    fstps (%esp)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, 2(%edi)
+; SSE-NEXT:    movw %si, (%edi)
+; SSE-NEXT:    addl $36, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    popl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 4
+; SSE-NEXT:    retl
+  %val = load <2 x bfloat>, ptr %ptr
+  %bf16 = call <2 x bfloat> @returns_v2bf16(<2 x bfloat> %val)
+  store <2 x bfloat> %bf16, ptr %ptr
+  ret void
+}
+
+define void @call_ret_v3bf16(ptr %ptr) {
+; NOSSE-LABEL: call_ret_v3bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    subl $40, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 48
+; NOSSE-NEXT:    .cfi_offset %esi, -8
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT:    movl (%esi), %eax
+; NOSSE-NEXT:    movl 4(%esi), %ecx
+; NOSSE-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT:    movl %edx, (%esp)
+; NOSSE-NEXT:    shll $16, %ecx
+; NOSSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl %eax, %ecx
+; NOSSE-NEXT:    andl $-65536, %ecx # imm = 0xFFFF0000
+; NOSSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    calll returns_v3bf16@PLT
+; NOSSE-NEXT:    subl $4, %esp
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT:    movw %cx, 4(%esi)
+; NOSSE-NEXT:    movl %eax, (%esi)
+; NOSSE-NEXT:    addl $40, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; NOSSE-NEXT:    retl
+;
+; SSE-LABEL: call_ret_v3bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    subl $40, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 48
+; SSE-NEXT:    .cfi_offset %esi, -8
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT:    movl (%esi), %eax
+; SSE-NEXT:    movl 4(%esi), %ecx
+; SSE-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; SSE-NEXT:    movl %edx, (%esp)
+; SSE-NEXT:    shll $16, %ecx
+; SSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    andl $-65536, %ecx # imm = 0xFFFF0000
+; SSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    calll returns_v3bf16@PLT
+; SSE-NEXT:    subl $4, %esp
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT:    movw %cx, 4(%esi)
+; SSE-NEXT:    movl %eax, (%esi)
+; SSE-NEXT:    addl $40, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 4
+; SSE-NEXT:    retl
+  %val = load <3 x bfloat>, ptr %ptr
+  %bf16 = call <3 x bfloat> @returns_v3bf16(<3 x bfloat> %val)
+  store <3 x bfloat> %bf16, ptr %ptr
+  ret void
+}
+
+define void @call_ret_v4bf16(ptr %ptr) {
+; NOSSE-LABEL: call_ret_v4bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    pushl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    subl $48, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 64
+; NOSSE-NEXT:    .cfi_offset %esi, -16
+; NOSSE-NEXT:    .cfi_offset %edi, -12
+; NOSSE-NEXT:    .cfi_offset %ebx, -8
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT:    movzwl 2(%esi), %ecx
+; NOSSE-NEXT:    movl (%esi), %eax
+; NOSSE-NEXT:    movl 4(%esi), %edx
+; NOSSE-NEXT:    movzwl 6(%esi), %edi
+; NOSSE-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT:    movl %ebx, (%esp)
+; NOSSE-NEXT:    shll $16, %edi
+; NOSSE-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %edx
+; NOSSE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %ecx
+; NOSSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    calll returns_v4bf16@PLT
+; NOSSE-NEXT:    subl $4, %esp
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT:    movw %dx, 6(%esi)
+; NOSSE-NEXT:    movw %cx, 4(%esi)
+; NOSSE-NEXT:    movl %eax, (%esi)
+; NOSSE-NEXT:    addl $48, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    popl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    popl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; NOSSE-NEXT:    retl
+;
+; SSE-LABEL: call_ret_v4bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    pushl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    subl $48, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 64
+; SSE-NEXT:    .cfi_offset %esi, -16
+; SSE-NEXT:    .cfi_offset %edi, -12
+; SSE-NEXT:    .cfi_offset %ebx, -8
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT:    movzwl 2(%esi), %ecx
+; SSE-NEXT:    movl (%esi), %eax
+; SSE-NEXT:    movl 4(%esi), %edx
+; SSE-NEXT:    movzwl 6(%esi), %edi
+; SSE-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT:    movl %ebx, (%esp)
+; SSE-NEXT:    shll $16, %edi
+; SSE-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %ecx
+; SSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    calll returns_v4bf16@PLT
+; SSE-NEXT:    subl $4, %esp
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; SSE-NEXT:    movw %dx, 6(%esi)
+; SSE-NEXT:    movw %cx, 4(%esi)
+; SSE-NEXT:    movl %eax, (%esi)
+; SSE-NEXT:    addl $48, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    popl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    popl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 4
+; SSE-NEXT:    retl
+  %val = load <4 x bfloat>, ptr %ptr
+  %bf16 = call <4 x bfloat> @returns_v4bf16(<4 x bfloat> %val)
+  store <4 x bfloat> %bf16, ptr %ptr
+  ret void
+}
+
+define void @call_ret_v8bf16(ptr %ptr) {
+; NOSSE-LABEL: call_ret_v8bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    pushl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    pushl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 20
+; NOSSE-NEXT:    subl $108, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 128
+; NOSSE-NEXT:    .cfi_offset %esi, -20
+; NOSSE-NEXT:    .cfi_offset %edi, -16
+; NOSSE-NEXT:    .cfi_offset %ebx, -12
+; NOSSE-NEXT:    .cfi_offset %ebp, -8
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT:    movzwl 2(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl (%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl 4(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movzwl 6(%esi), %edi
+; NOSSE-NEXT:    movl 8(%esi), %ebx
+; NOSSE-NEXT:    movzwl 10(%esi), %ebp
+; NOSSE-NEXT:    movl 12(%esi), %ecx
+; NOSSE-NEXT:    movzwl 14(%esi), %eax
+; NOSSE-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT:    movl %edx, (%esp)
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %ecx
+; NOSSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %ebp
+; NOSSE-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %ebx
+; NOSSE-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %edi
+; NOSSE-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    calll returns_v8bf16@PLT
+; NOSSE-NEXT:    subl $4, %esp
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ebp
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    movw %ax, 14(%esi)
+; NOSSE-NEXT:    movw %bp, 12(%esi)
+; NOSSE-NEXT:    movw %bx, 10(%esi)
+; NOSSE-NEXT:    movw %di, 8(%esi)
+; NOSSE-NEXT:    movw %dx, 6(%esi)
+; NOSSE-NEXT:    movw %cx, 4(%esi)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    movl %eax, (%esi)
+; NOSSE-NEXT:    addl $108, %esp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 20
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; NOSSE-NEXT:    popl %edi
+; NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; NOSSE-NEXT:    popl %ebx
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    popl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; NOSSE-NEXT:    retl
+;
+; SSE-LABEL: call_ret_v8bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %ebp
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    pushl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    pushl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 20
+; SSE-NEXT:    subl $108, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 128
+; SSE-NEXT:    .cfi_offset %esi, -20
+; SSE-NEXT:    .cfi_offset %edi, -16
+; SSE-NEXT:    .cfi_offset %ebx, -12
+; SSE-NEXT:    .cfi_offset %ebp, -8
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT:    movzwl 2(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl (%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl 4(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movzwl 6(%esi), %edi
+; SSE-NEXT:    movl 8(%esi), %ebx
+; SSE-NEXT:    movzwl 10(%esi), %ebp
+; SSE-NEXT:    movl 12(%esi), %ecx
+; SSE-NEXT:    movzwl 14(%esi), %eax
+; SSE-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; SSE-NEXT:    movl %edx, (%esp)
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %ecx
+; SSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %ebp
+; SSE-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %ebx
+; SSE-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %edi
+; SSE-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    calll returns_v8bf16@PLT
+; SSE-NEXT:    subl $4, %esp
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edi
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ebp
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movw %ax, 14(%esi)
+; SSE-NEXT:    movw %bp, 12(%esi)
+; SSE-NEXT:    movw %bx, 10(%esi)
+; SSE-NEXT:    movw %di, 8(%esi)
+; SSE-NEXT:    movw %dx, 6(%esi)
+; SSE-NEXT:    movw %cx, 4(%esi)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    movl %eax, (%esi)
+; SSE-NEXT:    addl $108, %esp
+; SSE-NEXT:    .cfi_def_cfa_offset 20
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    popl %edi
+; SSE-NEXT:    .cfi_def_cfa_offset 12
+; SSE-NEXT:    popl %ebx
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    popl %ebp
+; SSE-NEXT:    .cfi_def_cfa_offset 4
+; SSE-NEXT:    retl
+  %val = load <8 x bfloat>, ptr %ptr
+  %bf16 = call <8 x bfloat> @returns_v8bf16(<8 x bfloat> %val)
+  store <8 x bfloat> %bf16, ptr %ptr
+  ret void
+}
+
+define void @call_ret_v16bf16(ptr %ptr) {
+; NOSSE-LABEL: call_ret_v16bf16:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    pushl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; NOSSE-NEXT:    .cfi_offset %ebp, -8
+; NOSSE-NEXT:    movl %esp, %ebp
+; NOSSE-NEXT:    .cfi_def_cfa_register %ebp
+; NOSSE-NEXT:    pushl %ebx
+; NOSSE-NEXT:    pushl %edi
+; NOSSE-NEXT:    pushl %esi
+; NOSSE-NEXT:    andl $-32, %esp
+; NOSSE-NEXT:    subl $256, %esp # imm = 0x100
+; NOSSE-NEXT:    .cfi_offset %esi, -20
+; NOSSE-NEXT:    .cfi_offset %edi, -16
+; NOSSE-NEXT:    .cfi_offset %ebx, -12
+; NOSSE-NEXT:    movl 8(%ebp), %esi
+; NOSSE-NEXT:    movzwl 2(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl (%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl 4(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movzwl 6(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl 8(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movzwl 10(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl 12(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movzwl 14(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl 16(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movzwl 18(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl 20(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movzwl 22(%esi), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl 24(%esi), %edi
+; NOSSE-NEXT:    movzwl 26(%esi), %edx
+; NOSSE-NEXT:    movl 28(%esi), %ecx
+; NOSSE-NEXT:    movzwl 30(%esi), %eax
+; NOSSE-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT:    movl %ebx, (%esp)
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %ecx
+; NOSSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %edx
+; NOSSE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    shll $16, %edi
+; NOSSE-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    shll $16, %eax
+; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    calll returns_v16bf16@PLT
+; NOSSE-NEXT:    subl $4, %esp
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT:    movl %edx, 28(%esi)
+; NOSSE-NEXT:    movl %eax, 24(%esi)
+; NOSSE-NEXT:    movl %ecx, 20(%esi)
+; NOSSE-NEXT:    movl %ebx, 16(%esi)
+; NOSSE-NEXT:    movl %edi, 12(%esi)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    movl %eax, 8(%esi)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    movl %eax, 4(%esi)
+; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; NOSSE-NEXT:    movl %eax, (%esi)
+; NOSSE-NEXT:    leal -12(%ebp), %esp
+; NOSSE-NEXT:    popl %esi
+; NOSSE-NEXT:    popl %edi
+; NOSSE-NEXT:    popl %ebx
+; NOSSE-NEXT:    popl %ebp
+; NOSSE-NEXT:    .cfi_def_cfa %esp, 4
+; NOSSE-NEXT:    retl
+;
+; SSE-LABEL: call_ret_v16bf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushl %ebp
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    .cfi_offset %ebp, -8
+; SSE-NEXT:    movl %esp, %ebp
+; SSE-NEXT:    .cfi_def_cfa_register %ebp
+; SSE-NEXT:    pushl %ebx
+; SSE-NEXT:    pushl %edi
+; SSE-NEXT:    pushl %esi
+; SSE-NEXT:    andl $-32, %esp
+; SSE-NEXT:    subl $256, %esp # imm = 0x100
+; SSE-NEXT:    .cfi_offset %esi, -20
+; SSE-NEXT:    .cfi_offset %edi, -16
+; SSE-NEXT:    .cfi_offset %ebx, -12
+; SSE-NEXT:    movl 8(%ebp), %esi
+; SSE-NEXT:    movzwl 2(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl (%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl 4(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movzwl 6(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl 8(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movzwl 10(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl 12(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movzwl 14(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl 16(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movzwl 18(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl 20(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movzwl 22(%esi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl 24(%esi), %edi
+; SSE-NEXT:    movzwl 26(%esi), %edx
+; SSE-NEXT:    movl 28(%esi), %ecx
+; SSE-NEXT:    movzwl 30(%esi), %eax
+; SSE-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT:    movl %ebx, (%esp)
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %ecx
+; SSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %edx
+; SSE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    shll $16, %edi
+; SSE-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT:    calll returns_v16bf16@PLT
+; SSE-NEXT:    subl $4, %esp
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SSE-NEXT:    movl %edx, 28(%esi)
+; SSE-NEXT:    movl %eax, 24(%esi)
+; SSE-NEXT:    movl %ecx, 20(%esi)
+; SSE-NEXT:    movl %ebx, 16(%esi)
+; SSE-NEXT:    movl %edi, 12(%esi)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    movl %eax, 8(%esi)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    movl %eax, 4(%esi)
+; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; SSE-NEXT:    movl %eax, (%esi)
+; SSE-NEXT:    leal -12(%ebp), %esp
+; SSE-NEXT:    popl %esi
+; SSE-NEXT:    popl %edi
+; SSE-NEXT:    popl %ebx
+; SSE-NEXT:    popl %ebp
+; SSE-NEXT:    .cfi_def_cfa %esp, 4
+; SSE-NEXT:    retl
+  %val = load <16 x bfloat>, ptr %ptr
+  %bf16 = call <16 x bfloat> @returns_v16bf16(<16 x bfloat> %val)
+  store <16 x bfloat> %bf16, ptr %ptr
+  ret void
+}

>From ba95e556e0618361490917632734f1cef0ba2eeb Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 23 May 2024 10:14:02 +0200
Subject: [PATCH 2/4] Add nounwind

---
 .../X86/bfloat-calling-conv-no-sse2.ll        | 226 ++----------------
 1 file changed, 14 insertions(+), 212 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
index 05f036201f4b1..f363cad816dfb 100644
--- a/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
@@ -4,7 +4,7 @@
 
 ; Make sure no assert without SSE2 and bfloat. Issue 92899
 
-define bfloat @return_arg_bf16(bfloat %x) {
+define bfloat @return_arg_bf16(bfloat %x) #0 {
 ; CHECK-LABEL: return_arg_bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
@@ -12,7 +12,7 @@ define bfloat @return_arg_bf16(bfloat %x) {
   ret bfloat %x
 }
 
-define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) {
+define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) #0 {
 ; CHECK-LABEL: return_arg_v2bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
@@ -21,17 +21,12 @@ define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) {
   ret <2 x bfloat> %x
 }
 
-define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) {
+define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 {
 ; NOSSE-LABEL: return_arg_v3bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    pushl %eax
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
-; NOSSE-NEXT:    .cfi_offset %esi, -12
-; NOSSE-NEXT:    .cfi_offset %edi, -8
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
@@ -51,23 +46,15 @@ define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) {
 ; NOSSE-NEXT:    movl %edi, (%esi)
 ; NOSSE-NEXT:    movl %esi, %eax
 ; NOSSE-NEXT:    addl $4, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    popl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; NOSSE-NEXT:    retl $4
 ;
 ; SSE-LABEL: return_arg_v3bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    pushl %eax
-; SSE-NEXT:    .cfi_def_cfa_offset 16
-; SSE-NEXT:    .cfi_offset %esi, -12
-; SSE-NEXT:    .cfi_offset %edi, -8
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
@@ -87,32 +74,20 @@ define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) {
 ; SSE-NEXT:    movl %edi, (%esi)
 ; SSE-NEXT:    movl %esi, %eax
 ; SSE-NEXT:    addl $4, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    popl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-NEXT:    retl $4
   ret <3 x bfloat> %x
 }
 
-define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) {
+define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 {
 ; NOSSE-LABEL: return_arg_v4bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    pushl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    pushl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 20
 ; NOSSE-NEXT:    subl $12, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 32
-; NOSSE-NEXT:    .cfi_offset %esi, -20
-; NOSSE-NEXT:    .cfi_offset %edi, -16
-; NOSSE-NEXT:    .cfi_offset %ebx, -12
-; NOSSE-NEXT:    .cfi_offset %ebp, -8
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
@@ -135,33 +110,19 @@ define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) {
 ; NOSSE-NEXT:    movw %si, (%ebp)
 ; NOSSE-NEXT:    movl %ebp, %eax
 ; NOSSE-NEXT:    addl $12, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 20
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    popl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    popl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    popl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; NOSSE-NEXT:    retl $4
 ;
 ; SSE-LABEL: return_arg_v4bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %ebp
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    pushl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    pushl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 20
 ; SSE-NEXT:    subl $12, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 32
-; SSE-NEXT:    .cfi_offset %esi, -20
-; SSE-NEXT:    .cfi_offset %edi, -16
-; SSE-NEXT:    .cfi_offset %ebx, -12
-; SSE-NEXT:    .cfi_offset %ebp, -8
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
@@ -184,36 +145,22 @@ define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) {
 ; SSE-NEXT:    movw %si, (%ebp)
 ; SSE-NEXT:    movl %ebp, %eax
 ; SSE-NEXT:    addl $12, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 20
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    popl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    popl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    popl %ebp
-; SSE-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-NEXT:    retl $4
   ret <4 x bfloat> %x
 }
 
-define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) {
+define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 {
 ; NOSSE-LABEL: return_arg_v8bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    pushl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    pushl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 20
 ; NOSSE-NEXT:    subl $12, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 32
-; NOSSE-NEXT:    .cfi_offset %esi, -20
-; NOSSE-NEXT:    .cfi_offset %edi, -16
-; NOSSE-NEXT:    .cfi_offset %ebx, -12
-; NOSSE-NEXT:    .cfi_offset %ebp, -8
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
@@ -260,33 +207,19 @@ define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) {
 ; NOSSE-NEXT:    movw %ax, (%ebp)
 ; NOSSE-NEXT:    movl %ebp, %eax
 ; NOSSE-NEXT:    addl $12, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 20
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    popl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    popl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    popl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; NOSSE-NEXT:    retl $4
 ;
 ; SSE-LABEL: return_arg_v8bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %ebp
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    pushl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    pushl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 20
 ; SSE-NEXT:    subl $12, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 32
-; SSE-NEXT:    .cfi_offset %esi, -20
-; SSE-NEXT:    .cfi_offset %edi, -16
-; SSE-NEXT:    .cfi_offset %ebx, -12
-; SSE-NEXT:    .cfi_offset %ebp, -8
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
@@ -333,36 +266,22 @@ define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) {
 ; SSE-NEXT:    movw %ax, (%ebp)
 ; SSE-NEXT:    movl %ebp, %eax
 ; SSE-NEXT:    addl $12, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 20
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    popl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    popl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    popl %ebp
-; SSE-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-NEXT:    retl $4
   ret <8 x bfloat> %x
 }
 
-define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) {
+define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 {
 ; NOSSE-LABEL: return_arg_v16bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    pushl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    pushl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 20
 ; NOSSE-NEXT:    subl $28, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 48
-; NOSSE-NEXT:    .cfi_offset %esi, -20
-; NOSSE-NEXT:    .cfi_offset %edi, -16
-; NOSSE-NEXT:    .cfi_offset %ebx, -12
-; NOSSE-NEXT:    .cfi_offset %ebp, -8
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
@@ -457,33 +376,19 @@ define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) {
 ; NOSSE-NEXT:    movw %ax, (%edi)
 ; NOSSE-NEXT:    movl %edi, %eax
 ; NOSSE-NEXT:    addl $28, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 20
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    popl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    popl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    popl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; NOSSE-NEXT:    retl $4
 ;
 ; SSE-LABEL: return_arg_v16bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %ebp
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    pushl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    pushl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 20
 ; SSE-NEXT:    subl $28, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 48
-; SSE-NEXT:    .cfi_offset %esi, -20
-; SSE-NEXT:    .cfi_offset %edi, -16
-; SSE-NEXT:    .cfi_offset %ebx, -12
-; SSE-NEXT:    .cfi_offset %ebp, -8
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
@@ -578,15 +483,10 @@ define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) {
 ; SSE-NEXT:    movw %ax, (%edi)
 ; SSE-NEXT:    movl %edi, %eax
 ; SSE-NEXT:    addl $28, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 20
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    popl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    popl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    popl %ebp
-; SSE-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-NEXT:    retl $4
   ret <16 x bfloat> %x
 }
@@ -598,14 +498,11 @@ declare <4 x bfloat> @returns_v4bf16(<4 x bfloat>)
 declare <8 x bfloat> @returns_v8bf16(<8 x bfloat>)
 declare <16 x bfloat> @returns_v16bf16(<16 x bfloat>)
 
-define void @call_ret_bf16(ptr %ptr) {
+define void @call_ret_bf16(ptr %ptr) #0 {
 ; NOSSE-LABEL: call_ret_bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    subl $8, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
-; NOSSE-NEXT:    .cfi_offset %esi, -8
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; NOSSE-NEXT:    movzwl (%esi), %eax
 ; NOSSE-NEXT:    shll $16, %eax
@@ -617,18 +514,13 @@ define void @call_ret_bf16(ptr %ptr) {
 ; NOSSE-NEXT:    calll __truncsfbf2
 ; NOSSE-NEXT:    movw %ax, (%esi)
 ; NOSSE-NEXT:    addl $8, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; NOSSE-NEXT:    retl
 ;
 ; SSE-LABEL: call_ret_bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    subl $8, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 16
-; SSE-NEXT:    .cfi_offset %esi, -8
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SSE-NEXT:    movzwl (%esi), %eax
 ; SSE-NEXT:    shll $16, %eax
@@ -640,9 +532,7 @@ define void @call_ret_bf16(ptr %ptr) {
 ; SSE-NEXT:    calll __truncsfbf2
 ; SSE-NEXT:    movw %ax, (%esi)
 ; SSE-NEXT:    addl $8, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-NEXT:    retl
   %val = load bfloat, ptr %ptr
   %bf16 = call bfloat @returns_bf16(bfloat %val)
@@ -650,17 +540,12 @@ define void @call_ret_bf16(ptr %ptr) {
   ret void
 }
 
-define void @call_ret_v2bf16(ptr %ptr) {
+define void @call_ret_v2bf16(ptr %ptr) #0 {
 ; NOSSE-LABEL: call_ret_v2bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    subl $20, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 32
-; NOSSE-NEXT:    .cfi_offset %esi, -12
-; NOSSE-NEXT:    .cfi_offset %edi, -8
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; NOSSE-NEXT:    movzwl 2(%edi), %eax
 ; NOSSE-NEXT:    shll $16, %eax
@@ -684,23 +569,15 @@ define void @call_ret_v2bf16(ptr %ptr) {
 ; NOSSE-NEXT:    movw %ax, 2(%edi)
 ; NOSSE-NEXT:    movw %si, (%edi)
 ; NOSSE-NEXT:    addl $20, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    popl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; NOSSE-NEXT:    retl
 ;
 ; SSE-LABEL: call_ret_v2bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    subl $36, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 48
-; SSE-NEXT:    .cfi_offset %esi, -12
-; SSE-NEXT:    .cfi_offset %edi, -8
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; SSE-NEXT:    movzwl 2(%edi), %eax
 ; SSE-NEXT:    shll $16, %eax
@@ -724,11 +601,8 @@ define void @call_ret_v2bf16(ptr %ptr) {
 ; SSE-NEXT:    movw %ax, 2(%edi)
 ; SSE-NEXT:    movw %si, (%edi)
 ; SSE-NEXT:    addl $36, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    popl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-NEXT:    retl
   %val = load <2 x bfloat>, ptr %ptr
   %bf16 = call <2 x bfloat> @returns_v2bf16(<2 x bfloat> %val)
@@ -736,14 +610,11 @@ define void @call_ret_v2bf16(ptr %ptr) {
   ret void
 }
 
-define void @call_ret_v3bf16(ptr %ptr) {
+define void @call_ret_v3bf16(ptr %ptr) #0 {
 ; NOSSE-LABEL: call_ret_v3bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    subl $40, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 48
-; NOSSE-NEXT:    .cfi_offset %esi, -8
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; NOSSE-NEXT:    movl (%esi), %eax
 ; NOSSE-NEXT:    movl 4(%esi), %ecx
@@ -769,18 +640,13 @@ define void @call_ret_v3bf16(ptr %ptr) {
 ; NOSSE-NEXT:    movw %cx, 4(%esi)
 ; NOSSE-NEXT:    movl %eax, (%esi)
 ; NOSSE-NEXT:    addl $40, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; NOSSE-NEXT:    retl
 ;
 ; SSE-LABEL: call_ret_v3bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    subl $40, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 48
-; SSE-NEXT:    .cfi_offset %esi, -8
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SSE-NEXT:    movl (%esi), %eax
 ; SSE-NEXT:    movl 4(%esi), %ecx
@@ -806,9 +672,7 @@ define void @call_ret_v3bf16(ptr %ptr) {
 ; SSE-NEXT:    movw %cx, 4(%esi)
 ; SSE-NEXT:    movl %eax, (%esi)
 ; SSE-NEXT:    addl $40, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-NEXT:    retl
   %val = load <3 x bfloat>, ptr %ptr
   %bf16 = call <3 x bfloat> @returns_v3bf16(<3 x bfloat> %val)
@@ -816,20 +680,13 @@ define void @call_ret_v3bf16(ptr %ptr) {
   ret void
 }
 
-define void @call_ret_v4bf16(ptr %ptr) {
+define void @call_ret_v4bf16(ptr %ptr) #0 {
 ; NOSSE-LABEL: call_ret_v4bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    pushl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    subl $48, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 64
-; NOSSE-NEXT:    .cfi_offset %esi, -16
-; NOSSE-NEXT:    .cfi_offset %edi, -12
-; NOSSE-NEXT:    .cfi_offset %ebx, -8
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; NOSSE-NEXT:    movzwl 2(%esi), %ecx
 ; NOSSE-NEXT:    movl (%esi), %eax
@@ -862,28 +719,17 @@ define void @call_ret_v4bf16(ptr %ptr) {
 ; NOSSE-NEXT:    movw %cx, 4(%esi)
 ; NOSSE-NEXT:    movl %eax, (%esi)
 ; NOSSE-NEXT:    addl $48, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    popl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    popl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; NOSSE-NEXT:    retl
 ;
 ; SSE-LABEL: call_ret_v4bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    pushl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    subl $48, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 64
-; SSE-NEXT:    .cfi_offset %esi, -16
-; SSE-NEXT:    .cfi_offset %edi, -12
-; SSE-NEXT:    .cfi_offset %ebx, -8
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SSE-NEXT:    movzwl 2(%esi), %ecx
 ; SSE-NEXT:    movl (%esi), %eax
@@ -916,13 +762,9 @@ define void @call_ret_v4bf16(ptr %ptr) {
 ; SSE-NEXT:    movw %cx, 4(%esi)
 ; SSE-NEXT:    movl %eax, (%esi)
 ; SSE-NEXT:    addl $48, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    popl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    popl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-NEXT:    retl
   %val = load <4 x bfloat>, ptr %ptr
   %bf16 = call <4 x bfloat> @returns_v4bf16(<4 x bfloat> %val)
@@ -930,23 +772,14 @@ define void @call_ret_v4bf16(ptr %ptr) {
   ret void
 }
 
-define void @call_ret_v8bf16(ptr %ptr) {
+define void @call_ret_v8bf16(ptr %ptr) #0 {
 ; NOSSE-LABEL: call_ret_v8bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    pushl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    pushl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 20
 ; NOSSE-NEXT:    subl $108, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 128
-; NOSSE-NEXT:    .cfi_offset %esi, -20
-; NOSSE-NEXT:    .cfi_offset %edi, -16
-; NOSSE-NEXT:    .cfi_offset %ebx, -12
-; NOSSE-NEXT:    .cfi_offset %ebp, -8
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; NOSSE-NEXT:    movzwl 2(%esi), %eax
 ; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1015,33 +848,19 @@ define void @call_ret_v8bf16(ptr %ptr) {
 ; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; NOSSE-NEXT:    movl %eax, (%esi)
 ; NOSSE-NEXT:    addl $108, %esp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 20
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; NOSSE-NEXT:    popl %edi
-; NOSSE-NEXT:    .cfi_def_cfa_offset 12
 ; NOSSE-NEXT:    popl %ebx
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; NOSSE-NEXT:    popl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; NOSSE-NEXT:    retl
 ;
 ; SSE-LABEL: call_ret_v8bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %ebp
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    pushl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    pushl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 20
 ; SSE-NEXT:    subl $108, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 128
-; SSE-NEXT:    .cfi_offset %esi, -20
-; SSE-NEXT:    .cfi_offset %edi, -16
-; SSE-NEXT:    .cfi_offset %ebx, -12
-; SSE-NEXT:    .cfi_offset %ebp, -8
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SSE-NEXT:    movzwl 2(%esi), %eax
 ; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1110,15 +929,10 @@ define void @call_ret_v8bf16(ptr %ptr) {
 ; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; SSE-NEXT:    movl %eax, (%esi)
 ; SSE-NEXT:    addl $108, %esp
-; SSE-NEXT:    .cfi_def_cfa_offset 20
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    popl %edi
-; SSE-NEXT:    .cfi_def_cfa_offset 12
 ; SSE-NEXT:    popl %ebx
-; SSE-NEXT:    .cfi_def_cfa_offset 8
 ; SSE-NEXT:    popl %ebp
-; SSE-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-NEXT:    retl
   %val = load <8 x bfloat>, ptr %ptr
   %bf16 = call <8 x bfloat> @returns_v8bf16(<8 x bfloat> %val)
@@ -1126,22 +940,16 @@ define void @call_ret_v8bf16(ptr %ptr) {
   ret void
 }
 
-define void @call_ret_v16bf16(ptr %ptr) {
+define void @call_ret_v16bf16(ptr %ptr) #0 {
 ; NOSSE-LABEL: call_ret_v16bf16:
 ; NOSSE:       # %bb.0:
 ; NOSSE-NEXT:    pushl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_offset 8
-; NOSSE-NEXT:    .cfi_offset %ebp, -8
 ; NOSSE-NEXT:    movl %esp, %ebp
-; NOSSE-NEXT:    .cfi_def_cfa_register %ebp
 ; NOSSE-NEXT:    pushl %ebx
 ; NOSSE-NEXT:    pushl %edi
 ; NOSSE-NEXT:    pushl %esi
 ; NOSSE-NEXT:    andl $-32, %esp
 ; NOSSE-NEXT:    subl $256, %esp # imm = 0x100
-; NOSSE-NEXT:    .cfi_offset %esi, -20
-; NOSSE-NEXT:    .cfi_offset %edi, -16
-; NOSSE-NEXT:    .cfi_offset %ebx, -12
 ; NOSSE-NEXT:    movl 8(%ebp), %esi
 ; NOSSE-NEXT:    movzwl 2(%esi), %eax
 ; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1278,24 +1086,17 @@ define void @call_ret_v16bf16(ptr %ptr) {
 ; NOSSE-NEXT:    popl %edi
 ; NOSSE-NEXT:    popl %ebx
 ; NOSSE-NEXT:    popl %ebp
-; NOSSE-NEXT:    .cfi_def_cfa %esp, 4
 ; NOSSE-NEXT:    retl
 ;
 ; SSE-LABEL: call_ret_v16bf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushl %ebp
-; SSE-NEXT:    .cfi_def_cfa_offset 8
-; SSE-NEXT:    .cfi_offset %ebp, -8
 ; SSE-NEXT:    movl %esp, %ebp
-; SSE-NEXT:    .cfi_def_cfa_register %ebp
 ; SSE-NEXT:    pushl %ebx
 ; SSE-NEXT:    pushl %edi
 ; SSE-NEXT:    pushl %esi
 ; SSE-NEXT:    andl $-32, %esp
 ; SSE-NEXT:    subl $256, %esp # imm = 0x100
-; SSE-NEXT:    .cfi_offset %esi, -20
-; SSE-NEXT:    .cfi_offset %edi, -16
-; SSE-NEXT:    .cfi_offset %ebx, -12
 ; SSE-NEXT:    movl 8(%ebp), %esi
 ; SSE-NEXT:    movzwl 2(%esi), %eax
 ; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1432,10 +1233,11 @@ define void @call_ret_v16bf16(ptr %ptr) {
 ; SSE-NEXT:    popl %edi
 ; SSE-NEXT:    popl %ebx
 ; SSE-NEXT:    popl %ebp
-; SSE-NEXT:    .cfi_def_cfa %esp, 4
 ; SSE-NEXT:    retl
   %val = load <16 x bfloat>, ptr %ptr
   %bf16 = call <16 x bfloat> @returns_v16bf16(<16 x bfloat> %val)
   store <16 x bfloat> %bf16, ptr %ptr
   ret void
 }
+
+attributes #0 = { nounwind }

>From 27f03318f805dca4b869a1dfb813191fe63d2036 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 29 May 2024 13:16:04 +0200
Subject: [PATCH 3/4] Fix new test

---
 llvm/test/CodeGen/X86/atomic-non-integer.ll | 55 ++++++++++++++++++---
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index d7633cb11e44c..22c60b8cfeda8 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -789,12 +789,55 @@ define double @load_double_seq_cst(ptr %fptr) {
 }
 
 define void @store_bfloat(ptr %fptr, bfloat %v) {
-; X86-LABEL: store_bfloat:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    retl
+; X86-SSE1-LABEL: store_bfloat:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    pushl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    subl $8, %esp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
+; X86-SSE1-NEXT:    .cfi_offset %esi, -8
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss %xmm0, (%esp)
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE1-NEXT:    calll __truncsfbf2
+; X86-SSE1-NEXT:    movw %ax, (%esi)
+; X86-SSE1-NEXT:    addl $8, %esp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    popl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    retl
+;
+; X86-SSE2-LABEL: store_bfloat:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movw %cx, (%eax)
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: store_bfloat:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movw %cx, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X86-NOSSE-LABEL: store_bfloat:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %esi
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 16
+; X86-NOSSE-NEXT:    .cfi_offset %esi, -8
+; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstps (%esp)
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT:    calll __truncsfbf2
+; X86-NOSSE-NEXT:    movw %ax, (%esi)
+; X86-NOSSE-NEXT:    addl $8, %esp
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT:    popl %esi
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-NOSSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: store_bfloat:
 ; X64-SSE:       # %bb.0:

>From ed4adaf93d2dc6e5a53988007638c4714e54148f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 29 May 2024 13:16:56 +0200
Subject: [PATCH 4/4] Remove bfloat ABI workaround in atomic test

---
 llvm/test/CodeGen/X86/atomic-non-integer.ll | 30 +++++----------------
 1 file changed, 6 insertions(+), 24 deletions(-)

diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index 22c60b8cfeda8..bd794712a3109 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -854,8 +854,7 @@ define void @store_bfloat(ptr %fptr, bfloat %v) {
   ret void
 }
 
-; Work around issue #92899 by casting to float
-define float @load_bfloat(ptr %fptr) {
+define bfloat @load_bfloat(ptr %fptr) {
 ; X86-SSE1-LABEL: load_bfloat:
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    pushl %eax
@@ -871,30 +870,16 @@ define float @load_bfloat(ptr %fptr) {
 ;
 ; X86-SSE2-LABEL: load_bfloat:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %eax
-; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzwl (%eax), %eax
-; X86-SSE2-NEXT:    shll $16, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    movd %xmm0, (%esp)
-; X86-SSE2-NEXT:    flds (%esp)
-; X86-SSE2-NEXT:    popl %eax
-; X86-SSE2-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE2-NEXT:    pinsrw $0, %eax, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX-LABEL: load_bfloat:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %eax
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movzwl (%eax), %eax
-; X86-AVX-NEXT:    shll $16, %eax
-; X86-AVX-NEXT:    vmovd %eax, %xmm0
-; X86-AVX-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX-NEXT:    flds (%esp)
-; X86-AVX-NEXT:    popl %eax
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
+; X86-AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-NOSSE-LABEL: load_bfloat:
@@ -913,17 +898,14 @@ define float @load_bfloat(ptr %fptr) {
 ; X64-SSE-LABEL: load_bfloat:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    movzwl (%rdi), %eax
-; X64-SSE-NEXT:    shll $16, %eax
-; X64-SSE-NEXT:    movd %eax, %xmm0
+; X64-SSE-NEXT:    pinsrw $0, %eax, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: load_bfloat:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    movzwl (%rdi), %eax
-; X64-AVX-NEXT:    shll $16, %eax
-; X64-AVX-NEXT:    vmovd %eax, %xmm0
+; X64-AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
 ; X64-AVX-NEXT:    retq
   %v = load atomic bfloat, ptr %fptr unordered, align 2
-  %ext = fpext bfloat %v to float
-  ret float %ext
+  ret bfloat %v
 }



More information about the llvm-commits mailing list