[llvm] 9745c13 - [X86][BF16] Improve float -> bfloat lowering under AVX512BF16 and AVXNECONVERT (#78042)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 16 18:09:30 PST 2024
Author: Phoebe Wang
Date: 2024-01-17T10:09:26+08:00
New Revision: 9745c13ca82538d30a04db7da40d3c4cd9f7f13c
URL: https://github.com/llvm/llvm-project/commit/9745c13ca82538d30a04db7da40d3c4cd9f7f13c
DIFF: https://github.com/llvm/llvm-project/commit/9745c13ca82538d30a04db7da40d3c4cd9f7f13c.diff
LOG: [X86][BF16] Improve float -> bfloat lowering under AVX512BF16 and AVXNECONVERT (#78042)
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrSSE.td
llvm/test/CodeGen/X86/bfloat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 700ab797b2f69f7..e19128ec7756519 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21523,9 +21523,19 @@ static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
+
+  MVT SVT = Op.getOperand(0).getSimpleValueType();
+  if (SVT == MVT::f32 && (Subtarget.hasBF16() || Subtarget.hasAVXNECONVERT())) {
+    SDValue Res;
+    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
+    Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
+    Res = DAG.getBitcast(MVT::v8i16, Res);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
+                       DAG.getIntPtrConstant(0, DL));
+  }
+
  MakeLibCallOptions CallOptions;
-  RTLIB::Libcall LC =
-      RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
+  RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
  SDValue Res =
      makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
  return DAG.getBitcast(MVT::i16, Res);
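
For reference, here is a minimal IR sketch (not part of this commit; the function name is illustrative) of the scalar case the hunk above targets. Compiled with llc and -mattr=+avx512bf16 or -mattr=+avxneconvert, the fptrunc is now expected to select vcvtneps2bf16 instead of calling __truncsfbf2, matching the bfloat.ll updates further down:

define bfloat @f32_to_bf16(float %x) {
  ; The new lowering scalarizes %x into a v4f32, converts it with
  ; X86ISD::CVTNEPS2BF16, and extracts element 0 as the bf16 bit pattern.
  %r = fptrunc float %x to bfloat
  ret bfloat %r
}
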
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e8a1a2b83886f8b..a8cd1996eeb356b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8331,6 +8331,10 @@ let Predicates = [HasAVXNECONVERT] in {
f256mem>, T8;
defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8, XS, ExplicitVEXPrefix;
+  def : Pat<(v8bf16 (X86cvtneps2bf16 (v4f32 VR128X:$src))),
+            (VCVTNEPS2BF16rr VR128:$src)>;
+  def : Pat<(v8bf16 (X86cvtneps2bf16 (loadv4f32 addr:$src))),
+            (VCVTNEPS2BF16rm addr:$src)>;
def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))),
          (VCVTNEPS2BF16Yrr VR256:$src)>;
def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))),
          (VCVTNEPS2BF16Yrm addr:$src)>;
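
The two added patterns match the 128-bit X86cvtneps2bf16 node that the scalar lowering above creates; the existing X86vfpround patterns continue to handle full-vector truncations. As a rough illustration (assumed flags, not taken from this diff), a v8f32 truncation should still select the 256-bit form:

define <8 x bfloat> @v8f32_to_v8bf16(<8 x float> %x) {
  ; Expected to use the ymm vcvtneps2bf16 via the pre-existing X86vfpround
  ; patterns rather than the newly added 128-bit ones.
  %r = fptrunc <8 x float> %x to <8 x bfloat>
  ret <8 x bfloat> %r
}
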
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index b309f47e4b7190f..9d2ef51b0a8fbe3 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -8,23 +8,18 @@
define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzwl (%edx), %edx
+; X86-NEXT: shll $16, %edx
+; X86-NEXT: vmovd %edx, %xmm0
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: shll $16, %ecx
-; X86-NEXT: vmovd %ecx, %xmm0
-; X86-NEXT: movzwl (%eax), %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vmovd %ecx, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovsh %xmm0, (%esi)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; X86-NEXT: vpextrw $0, %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE2-LABEL: add:
@@ -44,37 +39,31 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; BF16-LABEL: add:
-; BF16: # %bb.0:
-; BF16-NEXT: pushq %rbx
-; BF16-NEXT: movq %rdx, %rbx
-; BF16-NEXT: movzwl (%rsi), %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: movzwl (%rdi), %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, (%rbx)
-; BF16-NEXT: popq %rbx
-; BF16-NEXT: retq
+; F16-LABEL: add:
+; F16: # %bb.0:
+; F16-NEXT: movzwl (%rsi), %eax
+; F16-NEXT: shll $16, %eax
+; F16-NEXT: vmovd %eax, %xmm0
+; F16-NEXT: movzwl (%rdi), %eax
+; F16-NEXT: shll $16, %eax
+; F16-NEXT: vmovd %eax, %xmm1
+; F16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; F16-NEXT: vpextrw $0, %xmm0, (%rdx)
+; F16-NEXT: retq
;
-; FP16-LABEL: add:
-; FP16: # %bb.0:
-; FP16-NEXT: pushq %rbx
-; FP16-NEXT: movq %rdx, %rbx
-; FP16-NEXT: movzwl (%rsi), %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: movzwl (%rdi), %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovsh %xmm0, (%rbx)
-; FP16-NEXT: popq %rbx
-; FP16-NEXT: retq
+; AVXNC-LABEL: add:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: movzwl (%rsi), %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: movzwl (%rdi), %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm1
+; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
+; AVXNC-NEXT: vpextrw $0, %xmm0, (%rdx)
+; AVXNC-NEXT: retq
%a = load bfloat, ptr %pa
%b = load bfloat, ptr %pb
%add = fadd bfloat %a, %b
@@ -85,7 +74,6 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; X86-LABEL: add2:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
@@ -93,9 +81,9 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; X86-NEXT: vmovw %xmm0, %eax
+; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: add2:
@@ -112,23 +100,8 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; BF16-LABEL: add2:
-; BF16: # %bb.0:
-; BF16-NEXT: pushq %rax
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: vpextrw $0, %xmm1, %ecx
-; BF16-NEXT: shll $16, %ecx
-; BF16-NEXT: vmovd %ecx, %xmm0
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: popq %rax
-; BF16-NEXT: retq
-;
; FP16-LABEL: add2:
; FP16: # %bb.0:
-; FP16-NEXT: pushq %rax
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: vmovw %xmm1, %ecx
; FP16-NEXT: shll $16, %ecx
@@ -136,9 +109,24 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: popq %rax
+; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: vmovw %eax, %xmm0
; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add2:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: vpextrw $0, %xmm1, %ecx
+; AVXNC-NEXT: shll $16, %ecx
+; AVXNC-NEXT: vmovd %ecx, %xmm0
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm1
+; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
+; AVXNC-NEXT: vmovd %xmm0, %eax
+; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNC-NEXT: retq
%add = fadd bfloat %a, %b
ret bfloat %add
}
@@ -166,8 +154,7 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-NEXT: shll $16, %edi
; X86-NEXT: vmovd %edi, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
@@ -208,35 +195,6 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; BF16-LABEL: add_double:
-; BF16: # %bb.0:
-; BF16-NEXT: pushq %rbp
-; BF16-NEXT: pushq %r14
-; BF16-NEXT: pushq %rbx
-; BF16-NEXT: movq %rdx, %rbx
-; BF16-NEXT: movq %rsi, %r14
-; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; BF16-NEXT: callq __truncdfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %ebp
-; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; BF16-NEXT: callq __truncdfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: shll $16, %ebp
-; BF16-NEXT: vmovd %ebp, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; BF16-NEXT: vmovsd %xmm0, (%rbx)
-; BF16-NEXT: popq %rbx
-; BF16-NEXT: popq %r14
-; BF16-NEXT: popq %rbp
-; BF16-NEXT: retq
-;
; FP16-LABEL: add_double:
; FP16: # %bb.0:
; FP16-NEXT: pushq %rbp
@@ -255,7 +213,7 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; FP16-NEXT: shll $16, %ebp
; FP16-NEXT: vmovd %ebp, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
+; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
@@ -265,6 +223,35 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; FP16-NEXT: popq %r14
; FP16-NEXT: popq %rbp
; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add_double:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: pushq %rbp
+; AVXNC-NEXT: pushq %r14
+; AVXNC-NEXT: pushq %rbx
+; AVXNC-NEXT: movq %rdx, %rbx
+; AVXNC-NEXT: movq %rsi, %r14
+; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVXNC-NEXT: callq __truncdfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
+; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVXNC-NEXT: callq __truncdfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: shll $16, %ebp
+; AVXNC-NEXT: vmovd %ebp, %xmm1
+; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
+; AVXNC-NEXT: vmovd %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVXNC-NEXT: vmovsd %xmm0, (%rbx)
+; AVXNC-NEXT: popq %rbx
+; AVXNC-NEXT: popq %r14
+; AVXNC-NEXT: popq %rbp
+; AVXNC-NEXT: retq
%la = load double, ptr %pa
%a = fptrunc double %la to bfloat
%lb = load double, ptr %pb
@@ -293,8 +280,7 @@ define double @add_double2(double %da, double %db) nounwind {
; X86-NEXT: shll $16, %esi
; X86-NEXT: vmovd %esi, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
@@ -330,31 +316,6 @@ define double @add_double2(double %da, double %db) nounwind {
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; BF16-LABEL: add_double2:
-; BF16: # %bb.0:
-; BF16-NEXT: pushq %rbx
-; BF16-NEXT: subq $16, %rsp
-; BF16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; BF16-NEXT: callq __truncdfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %ebx
-; BF16-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
-; BF16-NEXT: # xmm0 = mem[0],zero
-; BF16-NEXT: callq __truncdfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: shll $16, %ebx
-; BF16-NEXT: vmovd %ebx, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; BF16-NEXT: addq $16, %rsp
-; BF16-NEXT: popq %rbx
-; BF16-NEXT: retq
-;
; FP16-LABEL: add_double2:
; FP16: # %bb.0:
; FP16-NEXT: pushq %rbx
@@ -371,7 +332,7 @@ define double @add_double2(double %da, double %db) nounwind {
; FP16-NEXT: shll $16, %ebx
; FP16-NEXT: vmovd %ebx, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
+; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
@@ -379,6 +340,31 @@ define double @add_double2(double %da, double %db) nounwind {
; FP16-NEXT: addq $16, %rsp
; FP16-NEXT: popq %rbx
; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add_double2:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: pushq %rbx
+; AVXNC-NEXT: subq $16, %rsp
+; AVXNC-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVXNC-NEXT: callq __truncdfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
+; AVXNC-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; AVXNC-NEXT: # xmm0 = mem[0],zero
+; AVXNC-NEXT: callq __truncdfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: shll $16, %ebx
+; AVXNC-NEXT: vmovd %ebx, %xmm1
+; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
+; AVXNC-NEXT: vmovd %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVXNC-NEXT: addq $16, %rsp
+; AVXNC-NEXT: popq %rbx
+; AVXNC-NEXT: retq
%a = fptrunc double %da to bfloat
%b = fptrunc double %db to bfloat
%add = fadd bfloat %a, %b
@@ -389,19 +375,14 @@ define double @add_double2(double %da, double %db) nounwind {
define void @add_constant(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: add_constant:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %ecx
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: vmovd %ecx, %xmm0
; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovsh %xmm0, (%esi)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; X86-NEXT: vpextrw $0, %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE2-LABEL: add_constant:
@@ -418,31 +399,25 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind {
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; BF16-LABEL: add_constant:
-; BF16: # %bb.0:
-; BF16-NEXT: pushq %rbx
-; BF16-NEXT: movq %rsi, %rbx
-; BF16-NEXT: movzwl (%rdi), %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, (%rbx)
-; BF16-NEXT: popq %rbx
-; BF16-NEXT: retq
+; F16-LABEL: add_constant:
+; F16: # %bb.0:
+; F16-NEXT: movzwl (%rdi), %eax
+; F16-NEXT: shll $16, %eax
+; F16-NEXT: vmovd %eax, %xmm0
+; F16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; F16-NEXT: vpextrw $0, %xmm0, (%rsi)
+; F16-NEXT: retq
;
-; FP16-LABEL: add_constant:
-; FP16: # %bb.0:
-; FP16-NEXT: pushq %rbx
-; FP16-NEXT: movq %rsi, %rbx
-; FP16-NEXT: movzwl (%rdi), %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovsh %xmm0, (%rbx)
-; FP16-NEXT: popq %rbx
-; FP16-NEXT: retq
+; AVXNC-LABEL: add_constant:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: movzwl (%rdi), %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
+; AVXNC-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVXNC-NEXT: retq
%a = load bfloat, ptr %pa
%add = fadd bfloat %a, 1.0
store bfloat %add, ptr %pc
@@ -452,14 +427,13 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind {
define bfloat @add_constant2(bfloat %a) nounwind {
; X86-LABEL: add_constant2:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; X86-NEXT: vmovw %xmm0, %eax
+; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: add_constant2:
@@ -473,27 +447,27 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; BF16-LABEL: add_constant2:
-; BF16: # %bb.0:
-; BF16-NEXT: pushq %rax
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: popq %rax
-; BF16-NEXT: retq
-;
; FP16-LABEL: add_constant2:
; FP16: # %bb.0:
-; FP16-NEXT: pushq %rax
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: popq %rax
+; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: vmovw %eax, %xmm0
; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add_constant2:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
+; AVXNC-NEXT: vmovd %xmm0, %eax
+; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNC-NEXT: retq
%add = fadd bfloat %a, 1.0
ret bfloat %add
}
@@ -551,138 +525,101 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; X86-LABEL: addv:
; X86: # %bb.0:
-; X86-NEXT: subl $172, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
; X86-NEXT: vmovw %xmm1, %eax
-; X86-NEXT: vmovdqa %xmm1, %xmm3
-; X86-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm2
; X86-NEXT: vmovw %xmm0, %eax
-; X86-NEXT: vmovdqa %xmm0, %xmm4
-; X86-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm1
-; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: vpextrw $1, %xmm3, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vpextrw $1, %xmm4, %eax
; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm1
-; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vpextrw $2, %xmm0, %eax
+; X86-NEXT: vmovd %eax, %xmm3
+; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
+; X86-NEXT: vmovw %xmm2, %ecx
+; X86-NEXT: vpextrw $1, %xmm1, %eax
; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpextrw $2, %xmm1, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm1
-; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vpextrw $3, %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpextrw $3, %xmm1, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm1
-; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vpextrw $4, %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpextrw $4, %xmm1, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm1
-; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vpextrw $5, %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpextrw $5, %xmm1, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm1
-; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vpextrw $6, %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpextrw $6, %xmm1, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm1
-; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vpextrw $7, %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpextrw $7, %xmm1, %eax
+; X86-NEXT: vmovd %eax, %xmm2
+; X86-NEXT: vpextrw $1, %xmm0, %eax
; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm1
-; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: vmovd %eax, %xmm3
+; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
+; X86-NEXT: vmovw %xmm2, %eax
+; X86-NEXT: vpextrw $2, %xmm1, %edx
+; X86-NEXT: shll $16, %edx
+; X86-NEXT: vmovd %edx, %xmm2
+; X86-NEXT: vpextrw $2, %xmm0, %edx
+; X86-NEXT: shll $16, %edx
+; X86-NEXT: vmovd %edx, %xmm3
+; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
+; X86-NEXT: vmovw %xmm2, %edx
+; X86-NEXT: vpextrw $3, %xmm1, %esi
+; X86-NEXT: shll $16, %esi
+; X86-NEXT: vmovd %esi, %xmm2
+; X86-NEXT: vpextrw $3, %xmm0, %esi
+; X86-NEXT: shll $16, %esi
+; X86-NEXT: vmovd %esi, %xmm3
+; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
+; X86-NEXT: vmovw %xmm2, %esi
+; X86-NEXT: vpextrw $4, %xmm1, %edi
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: vmovd %edi, %xmm2
+; X86-NEXT: vpextrw $4, %xmm0, %edi
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: vmovd %edi, %xmm3
+; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
+; X86-NEXT: vmovw %xmm2, %ebx
+; X86-NEXT: vpextrw $5, %xmm1, %edi
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: vmovd %edi, %xmm2
+; X86-NEXT: vpextrw $5, %xmm0, %edi
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: vmovd %edi, %xmm3
+; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
+; X86-NEXT: vmovw %xmm2, %edi
+; X86-NEXT: vpextrw $6, %xmm1, %ebp
+; X86-NEXT: shll $16, %ebp
+; X86-NEXT: vmovd %ebp, %xmm2
+; X86-NEXT: vpextrw $6, %xmm0, %ebp
+; X86-NEXT: shll $16, %ebp
+; X86-NEXT: vmovd %ebp, %xmm3
+; X86-NEXT: vaddss %xmm2, %xmm3, %xmm3
+; X86-NEXT: vmovw %ecx, %xmm2
+; X86-NEXT: vcvtneps2bf16 %xmm3, %xmm3
+; X86-NEXT: vmovw %xmm3, %ecx
+; X86-NEXT: vmovw %ebx, %xmm3
+; X86-NEXT: vpextrw $7, %xmm1, %ebx
+; X86-NEXT: shll $16, %ebx
+; X86-NEXT: vmovd %ebx, %xmm1
+; X86-NEXT: vpextrw $7, %xmm0, %ebx
+; X86-NEXT: shll $16, %ebx
+; X86-NEXT: vmovd %ebx, %xmm0
+; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmovw %ecx, %xmm1
+; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; X86-NEXT: vmovw %xmm0, %ecx
+; X86-NEXT: vmovw %ecx, %xmm0
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; X86-NEXT: vmovw %edi, %xmm1
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X86-NEXT: vmovw %edx, %xmm3
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
+; X86-NEXT: vmovw %esi, %xmm1
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X86-NEXT: vmovw %eax, %xmm3
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X86-NEXT: addl $172, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE2-LABEL: addv:
@@ -819,227 +756,177 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; BF16-LABEL: addv:
-; BF16: # %bb.0:
-; BF16-NEXT: pushq %rbp
-; BF16-NEXT: pushq %r15
-; BF16-NEXT: pushq %r14
-; BF16-NEXT: pushq %r13
-; BF16-NEXT: pushq %r12
-; BF16-NEXT: pushq %rbx
-; BF16-NEXT: subq $40, %rsp
-; BF16-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; BF16-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; BF16-NEXT: vpextrw $7, %xmm1, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm2
-; BF16-NEXT: vpextrw $7, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm2, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; BF16-NEXT: vpextrw $6, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; BF16-NEXT: vpextrw $6, %xmm1, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %ebp
-; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; BF16-NEXT: vpextrw $5, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; BF16-NEXT: vpextrw $5, %xmm1, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %r14d
-; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; BF16-NEXT: vpextrw $4, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; BF16-NEXT: vpextrw $4, %xmm1, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %r15d
-; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; BF16-NEXT: vpextrw $3, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; BF16-NEXT: vpextrw $3, %xmm1, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %r12d
-; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; BF16-NEXT: vpextrw $2, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; BF16-NEXT: vpextrw $2, %xmm1, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %r13d
-; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; BF16-NEXT: vmovd %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; BF16-NEXT: vmovd %xmm1, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %ebx
-; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; BF16-NEXT: vpextrw $1, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; BF16-NEXT: vpextrw $1, %xmm1, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm1
-; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: vmovd %ebx, %xmm0
-; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; BF16-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
-; BF16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
-; BF16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
-; BF16-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
-; BF16-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
-; BF16-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
-; BF16-NEXT: addq $40, %rsp
-; BF16-NEXT: popq %rbx
-; BF16-NEXT: popq %r12
-; BF16-NEXT: popq %r13
-; BF16-NEXT: popq %r14
-; BF16-NEXT: popq %r15
-; BF16-NEXT: popq %rbp
-; BF16-NEXT: retq
-;
; FP16-LABEL: addv:
; FP16: # %bb.0:
-; FP16-NEXT: subq $152, %rsp
-; FP16-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT: vmovw %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm2
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm2, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT: vmovd %eax, %xmm3
+; FP16-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; FP16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
+; FP16-NEXT: vmovw %xmm2, %eax
+; FP16-NEXT: vmovw %eax, %xmm2
+; FP16-NEXT: vpextrw $1, %xmm1, %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm3
; FP16-NEXT: vpextrw $1, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $1, %xmm1, %eax
+; FP16-NEXT: vmovd %eax, %xmm4
+; FP16-NEXT: vaddss %xmm3, %xmm4, %xmm3
+; FP16-NEXT: vcvtneps2bf16 %xmm3, %xmm3
+; FP16-NEXT: vmovw %xmm3, %eax
+; FP16-NEXT: vmovw %eax, %xmm3
+; FP16-NEXT: vpextrw $2, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT: vmovd %eax, %xmm4
; FP16-NEXT: vpextrw $2, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $2, %xmm1, %eax
+; FP16-NEXT: vmovd %eax, %xmm5
+; FP16-NEXT: vaddss %xmm4, %xmm5, %xmm4
+; FP16-NEXT: vcvtneps2bf16 %xmm4, %xmm4
+; FP16-NEXT: vmovw %xmm4, %eax
+; FP16-NEXT: vmovw %eax, %xmm4
+; FP16-NEXT: vpextrw $3, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT: vmovd %eax, %xmm5
; FP16-NEXT: vpextrw $3, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $3, %xmm1, %eax
+; FP16-NEXT: vmovd %eax, %xmm6
+; FP16-NEXT: vaddss %xmm5, %xmm6, %xmm5
+; FP16-NEXT: vcvtneps2bf16 %xmm5, %xmm5
+; FP16-NEXT: vmovw %xmm5, %eax
+; FP16-NEXT: vmovw %eax, %xmm5
+; FP16-NEXT: vpextrw $4, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT: vmovd %eax, %xmm6
; FP16-NEXT: vpextrw $4, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $4, %xmm1, %eax
+; FP16-NEXT: vmovd %eax, %xmm7
+; FP16-NEXT: vaddss %xmm6, %xmm7, %xmm6
+; FP16-NEXT: vcvtneps2bf16 %xmm6, %xmm6
+; FP16-NEXT: vmovw %xmm6, %eax
+; FP16-NEXT: vmovw %eax, %xmm6
+; FP16-NEXT: vpextrw $5, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT: vmovd %eax, %xmm7
; FP16-NEXT: vpextrw $5, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $5, %xmm1, %eax
+; FP16-NEXT: vmovd %eax, %xmm8
+; FP16-NEXT: vaddss %xmm7, %xmm8, %xmm7
+; FP16-NEXT: vcvtneps2bf16 %xmm7, %xmm7
+; FP16-NEXT: vmovw %xmm7, %eax
+; FP16-NEXT: vmovw %eax, %xmm7
+; FP16-NEXT: vpextrw $6, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT: vmovd %eax, %xmm8
; FP16-NEXT: vpextrw $6, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $6, %xmm1, %eax
+; FP16-NEXT: vmovd %eax, %xmm9
+; FP16-NEXT: vaddss %xmm8, %xmm9, %xmm8
+; FP16-NEXT: vcvtneps2bf16 %xmm8, %xmm8
+; FP16-NEXT: vmovw %xmm8, %eax
+; FP16-NEXT: vmovw %eax, %xmm8
+; FP16-NEXT: vpextrw $7, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT: vpextrw $7, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $7, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: callq __truncsfbf2@PLT
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; FP16-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: vmovw %eax, %xmm0
+; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
+; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; FP16-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; FP16-NEXT: addq $152, %rsp
; FP16-NEXT: retq
+;
+; AVXNC-LABEL: addv:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: vpextrw $7, %xmm1, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm2
+; AVXNC-NEXT: vpextrw $7, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm3
+; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
+; AVXNC-NEXT: vmovd %xmm2, %eax
+; AVXNC-NEXT: vpextrw $6, %xmm1, %ecx
+; AVXNC-NEXT: shll $16, %ecx
+; AVXNC-NEXT: vmovd %ecx, %xmm2
+; AVXNC-NEXT: vpextrw $6, %xmm0, %ecx
+; AVXNC-NEXT: shll $16, %ecx
+; AVXNC-NEXT: vmovd %ecx, %xmm3
+; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
+; AVXNC-NEXT: vmovd %xmm2, %ecx
+; AVXNC-NEXT: vpextrw $5, %xmm1, %edx
+; AVXNC-NEXT: shll $16, %edx
+; AVXNC-NEXT: vmovd %edx, %xmm2
+; AVXNC-NEXT: vpextrw $5, %xmm0, %edx
+; AVXNC-NEXT: shll $16, %edx
+; AVXNC-NEXT: vmovd %edx, %xmm3
+; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
+; AVXNC-NEXT: vmovd %xmm2, %edx
+; AVXNC-NEXT: vpextrw $4, %xmm1, %esi
+; AVXNC-NEXT: shll $16, %esi
+; AVXNC-NEXT: vmovd %esi, %xmm2
+; AVXNC-NEXT: vpextrw $4, %xmm0, %esi
+; AVXNC-NEXT: shll $16, %esi
+; AVXNC-NEXT: vmovd %esi, %xmm3
+; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
+; AVXNC-NEXT: vmovd %xmm2, %esi
+; AVXNC-NEXT: vpextrw $3, %xmm1, %edi
+; AVXNC-NEXT: shll $16, %edi
+; AVXNC-NEXT: vmovd %edi, %xmm2
+; AVXNC-NEXT: vpextrw $3, %xmm0, %edi
+; AVXNC-NEXT: shll $16, %edi
+; AVXNC-NEXT: vmovd %edi, %xmm3
+; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
+; AVXNC-NEXT: vmovd %xmm2, %edi
+; AVXNC-NEXT: vpextrw $2, %xmm1, %r8d
+; AVXNC-NEXT: shll $16, %r8d
+; AVXNC-NEXT: vmovd %r8d, %xmm2
+; AVXNC-NEXT: vpextrw $2, %xmm0, %r8d
+; AVXNC-NEXT: shll $16, %r8d
+; AVXNC-NEXT: vmovd %r8d, %xmm3
+; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
+; AVXNC-NEXT: vmovd %xmm2, %r8d
+; AVXNC-NEXT: vpextrw $1, %xmm1, %r9d
+; AVXNC-NEXT: shll $16, %r9d
+; AVXNC-NEXT: vmovd %r9d, %xmm2
+; AVXNC-NEXT: vpextrw $1, %xmm0, %r9d
+; AVXNC-NEXT: shll $16, %r9d
+; AVXNC-NEXT: vmovd %r9d, %xmm3
+; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
+; AVXNC-NEXT: vmovd %xmm1, %r9d
+; AVXNC-NEXT: shll $16, %r9d
+; AVXNC-NEXT: vmovd %r9d, %xmm1
+; AVXNC-NEXT: vmovd %xmm0, %r9d
+; AVXNC-NEXT: shll $16, %r9d
+; AVXNC-NEXT: vmovd %r9d, %xmm0
+; AVXNC-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
+; AVXNC-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVXNC-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
+; AVXNC-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
+; AVXNC-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
+; AVXNC-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
+; AVXNC-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVXNC-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVXNC-NEXT: retq
%add = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %add
}
More information about the llvm-commits
mailing list