[llvm] 86df55e - [X86][BF16] Promote vector ADD/SUB/MUL/DIV to f32 (#87858)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 11 18:21:21 PDT 2024
Author: Phoebe Wang
Date: 2024-04-12T09:21:17+08:00
New Revision: 86df55efa1242e308535679cf05c2793ff38f3d9
URL: https://github.com/llvm/llvm-project/commit/86df55efa1242e308535679cf05c2793ff38f3d9
DIFF: https://github.com/llvm/llvm-project/commit/86df55efa1242e308535679cf05c2793ff38f3d9.diff
LOG: [X86][BF16] Promote vector ADD/SUB/MUL/DIV to f32 (#87858)
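
A minimal scalar sketch of what the promotion computes, mirroring the vpslld $16 / vaddps / vcvtneps2bf16 sequence in the updated tests. This is an illustrative sketch, not code from the patch: the helper names are made up, round-to-nearest-even narrowing is assumed for normal values, and the NaN quieting that vcvtneps2bf16 performs is omitted.

    #include <cstdint>
    #include <cstring>

    // bf16 stored as its raw 16-bit pattern (the high half of an IEEE f32).
    using bf16_bits = uint16_t;

    static float bf16_to_f32(bf16_bits b) {
      // Place the bf16 bits in the high 16 bits of an f32 word,
      // the scalar analogue of vpmovzxwd + vpslld $16 in the tests.
      uint32_t w = static_cast<uint32_t>(b) << 16;
      float f;
      std::memcpy(&f, &w, sizeof(f));
      return f;
    }

    static bf16_bits f32_to_bf16(float f) {
      uint32_t w;
      std::memcpy(&w, &f, sizeof(w));
      // Round to nearest even on the discarded low 16 bits
      // (NaN handling intentionally omitted in this sketch).
      w += 0x7FFF + ((w >> 16) & 1);
      return static_cast<bf16_bits>(w >> 16);
    }

    // Promoted add: widen both operands to f32, add, narrow back to bf16.
    bf16_bits bf16_add(bf16_bits a, bf16_bits b) {
      return f32_to_bf16(bf16_to_f32(a) + bf16_to_f32(b));
    }
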
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/bfloat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f274da6f6f7767..b7cb4b7dafeb69 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1978,10 +1978,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom);
- for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
- setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
+ for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
- }
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
@@ -2296,15 +2294,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::bf16, Custom);
for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
setF16Action(VT, Expand);
- setOperationAction(ISD::FADD, VT, Expand);
- setOperationAction(ISD::FSUB, VT, Expand);
- setOperationAction(ISD::FMUL, VT, Expand);
- setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
+ for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
+ setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
+ setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
+ }
setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
}
@@ -2312,10 +2310,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
setF16Action(MVT::v32bf16, Expand);
- setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
- setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
- setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
- setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
+ for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
+ setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 6e3020d628edfc..8a2109a1c78df9 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -525,101 +525,13 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; X86-LABEL: addv:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: vmovw %xmm1, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm2
-; X86-NEXT: vmovw %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %ecx
-; X86-NEXT: vpextrw $1, %xmm1, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm2
-; X86-NEXT: vpextrw $1, %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %eax
-; X86-NEXT: vpextrw $2, %xmm1, %edx
-; X86-NEXT: shll $16, %edx
-; X86-NEXT: vmovd %edx, %xmm2
-; X86-NEXT: vpextrw $2, %xmm0, %edx
-; X86-NEXT: shll $16, %edx
-; X86-NEXT: vmovd %edx, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %edx
-; X86-NEXT: vpextrw $3, %xmm1, %esi
-; X86-NEXT: shll $16, %esi
-; X86-NEXT: vmovd %esi, %xmm2
-; X86-NEXT: vpextrw $3, %xmm0, %esi
-; X86-NEXT: shll $16, %esi
-; X86-NEXT: vmovd %esi, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %esi
-; X86-NEXT: vpextrw $4, %xmm1, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm2
-; X86-NEXT: vpextrw $4, %xmm0, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %ebx
-; X86-NEXT: vpextrw $5, %xmm1, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm2
-; X86-NEXT: vpextrw $5, %xmm0, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %edi
-; X86-NEXT: vpextrw $6, %xmm1, %ebp
-; X86-NEXT: shll $16, %ebp
-; X86-NEXT: vmovd %ebp, %xmm2
-; X86-NEXT: vpextrw $6, %xmm0, %ebp
-; X86-NEXT: shll $16, %ebp
-; X86-NEXT: vmovd %ebp, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm3
-; X86-NEXT: vmovw %ecx, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm3, %xmm3
-; X86-NEXT: vmovw %xmm3, %ecx
-; X86-NEXT: vmovw %ebx, %xmm3
-; X86-NEXT: vpextrw $7, %xmm1, %ebx
-; X86-NEXT: shll $16, %ebx
-; X86-NEXT: vmovd %ebx, %xmm1
-; X86-NEXT: vpextrw $7, %xmm0, %ebx
-; X86-NEXT: shll $16, %ebx
-; X86-NEXT: vmovd %ebx, %xmm0
-; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovw %ecx, %xmm1
-; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; X86-NEXT: vmovw %xmm0, %ecx
-; X86-NEXT: vmovw %ecx, %xmm0
-; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-NEXT: vmovw %edi, %xmm1
-; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X86-NEXT: vmovw %edx, %xmm3
-; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT: vmovw %esi, %xmm1
-; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X86-NEXT: vmovw %eax, %xmm3
-; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X86-NEXT: vpslld $16, %ymm1, %ymm1
+; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT: vpslld $16, %ymm0, %ymm0
+; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: addv:
@@ -756,176 +668,26 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; FP16-LABEL: addv:
-; FP16: # %bb.0:
-; FP16-NEXT: vmovw %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm2
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm3
-; FP16-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; FP16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; FP16-NEXT: vmovw %xmm2, %eax
-; FP16-NEXT: vmovw %eax, %xmm2
-; FP16-NEXT: vpextrw $1, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm3
-; FP16-NEXT: vpextrw $1, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm4
-; FP16-NEXT: vaddss %xmm3, %xmm4, %xmm3
-; FP16-NEXT: vcvtneps2bf16 %xmm3, %xmm3
-; FP16-NEXT: vmovw %xmm3, %eax
-; FP16-NEXT: vmovw %eax, %xmm3
-; FP16-NEXT: vpextrw $2, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm4
-; FP16-NEXT: vpextrw $2, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm5
-; FP16-NEXT: vaddss %xmm4, %xmm5, %xmm4
-; FP16-NEXT: vcvtneps2bf16 %xmm4, %xmm4
-; FP16-NEXT: vmovw %xmm4, %eax
-; FP16-NEXT: vmovw %eax, %xmm4
-; FP16-NEXT: vpextrw $3, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm5
-; FP16-NEXT: vpextrw $3, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm6
-; FP16-NEXT: vaddss %xmm5, %xmm6, %xmm5
-; FP16-NEXT: vcvtneps2bf16 %xmm5, %xmm5
-; FP16-NEXT: vmovw %xmm5, %eax
-; FP16-NEXT: vmovw %eax, %xmm5
-; FP16-NEXT: vpextrw $4, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm6
-; FP16-NEXT: vpextrw $4, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm7
-; FP16-NEXT: vaddss %xmm6, %xmm7, %xmm6
-; FP16-NEXT: vcvtneps2bf16 %xmm6, %xmm6
-; FP16-NEXT: vmovw %xmm6, %eax
-; FP16-NEXT: vmovw %eax, %xmm6
-; FP16-NEXT: vpextrw $5, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm7
-; FP16-NEXT: vpextrw $5, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm8
-; FP16-NEXT: vaddss %xmm7, %xmm8, %xmm7
-; FP16-NEXT: vcvtneps2bf16 %xmm7, %xmm7
-; FP16-NEXT: vmovw %xmm7, %eax
-; FP16-NEXT: vmovw %eax, %xmm7
-; FP16-NEXT: vpextrw $6, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm8
-; FP16-NEXT: vpextrw $6, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm9
-; FP16-NEXT: vaddss %xmm8, %xmm9, %xmm8
-; FP16-NEXT: vcvtneps2bf16 %xmm8, %xmm8
-; FP16-NEXT: vmovw %xmm8, %eax
-; FP16-NEXT: vmovw %eax, %xmm8
-; FP16-NEXT: vpextrw $7, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vpextrw $7, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: vmovw %eax, %xmm0
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; FP16-NEXT: retq
+; F16-LABEL: addv:
+; F16: # %bb.0:
+; F16-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; F16-NEXT: vpslld $16, %ymm1, %ymm1
+; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; F16-NEXT: vpslld $16, %ymm0, %ymm0
+; F16-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+; F16-NEXT: vzeroupper
+; F16-NEXT: retq
;
; AVXNC-LABEL: addv:
; AVXNC: # %bb.0:
-; AVXNC-NEXT: vpextrw $7, %xmm1, %eax
-; AVXNC-NEXT: shll $16, %eax
-; AVXNC-NEXT: vmovd %eax, %xmm2
-; AVXNC-NEXT: vpextrw $7, %xmm0, %eax
-; AVXNC-NEXT: shll $16, %eax
-; AVXNC-NEXT: vmovd %eax, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %eax
-; AVXNC-NEXT: vpextrw $6, %xmm1, %ecx
-; AVXNC-NEXT: shll $16, %ecx
-; AVXNC-NEXT: vmovd %ecx, %xmm2
-; AVXNC-NEXT: vpextrw $6, %xmm0, %ecx
-; AVXNC-NEXT: shll $16, %ecx
-; AVXNC-NEXT: vmovd %ecx, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %ecx
-; AVXNC-NEXT: vpextrw $5, %xmm1, %edx
-; AVXNC-NEXT: shll $16, %edx
-; AVXNC-NEXT: vmovd %edx, %xmm2
-; AVXNC-NEXT: vpextrw $5, %xmm0, %edx
-; AVXNC-NEXT: shll $16, %edx
-; AVXNC-NEXT: vmovd %edx, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %edx
-; AVXNC-NEXT: vpextrw $4, %xmm1, %esi
-; AVXNC-NEXT: shll $16, %esi
-; AVXNC-NEXT: vmovd %esi, %xmm2
-; AVXNC-NEXT: vpextrw $4, %xmm0, %esi
-; AVXNC-NEXT: shll $16, %esi
-; AVXNC-NEXT: vmovd %esi, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %esi
-; AVXNC-NEXT: vpextrw $3, %xmm1, %edi
-; AVXNC-NEXT: shll $16, %edi
-; AVXNC-NEXT: vmovd %edi, %xmm2
-; AVXNC-NEXT: vpextrw $3, %xmm0, %edi
-; AVXNC-NEXT: shll $16, %edi
-; AVXNC-NEXT: vmovd %edi, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %edi
-; AVXNC-NEXT: vpextrw $2, %xmm1, %r8d
-; AVXNC-NEXT: shll $16, %r8d
-; AVXNC-NEXT: vmovd %r8d, %xmm2
-; AVXNC-NEXT: vpextrw $2, %xmm0, %r8d
-; AVXNC-NEXT: shll $16, %r8d
-; AVXNC-NEXT: vmovd %r8d, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %r8d
-; AVXNC-NEXT: vpextrw $1, %xmm1, %r9d
-; AVXNC-NEXT: shll $16, %r9d
-; AVXNC-NEXT: vmovd %r9d, %xmm2
-; AVXNC-NEXT: vpextrw $1, %xmm0, %r9d
-; AVXNC-NEXT: shll $16, %r9d
-; AVXNC-NEXT: vmovd %r9d, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm1, %r9d
-; AVXNC-NEXT: shll $16, %r9d
-; AVXNC-NEXT: vmovd %r9d, %xmm1
-; AVXNC-NEXT: vmovd %xmm0, %r9d
-; AVXNC-NEXT: shll $16, %r9d
-; AVXNC-NEXT: vmovd %r9d, %xmm0
-; AVXNC-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
-; AVXNC-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVXNC-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVXNC-NEXT: vpslld $16, %ymm1, %ymm1
+; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVXNC-NEXT: vpslld $16, %ymm0, %ymm0
+; AVXNC-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
+; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: retq
%add = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %add