[flang-commits] [libcxx] [flang] [libc] [clang-tools-extra] [llvm] [lldb] [clang] [compiler-rt] [X86][BF16][WIP] Try to use `f16` for lowering (PR #76901)
Phoebe Wang via flang-commits
flang-commits at lists.llvm.org
Wed Jan 3 22:05:06 PST 2024
https://github.com/phoebewang updated https://github.com/llvm/llvm-project/pull/76901
From ff9b72bdb5442a037d4325619de66e25ad211586 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang at intel.com>
Date: Mon, 1 Jan 2024 15:13:38 +0800
Subject: [PATCH] [X86][BF16][WIP] Try to use `f16` for lowering
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +-
llvm/lib/Target/X86/X86ISelLowering.h | 10 -
llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 37 +-
llvm/test/CodeGen/X86/bfloat.ll | 1199 +++++++++++--------
4 files changed, 739 insertions(+), 518 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a90ddf132c3897..6d25992315b2db 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7475,10 +7475,12 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
- MVT IVT = VT.changeVectorElementTypeToInteger();
+ MVT IVT =
+ VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
SmallVector<SDValue, 16> NewOps;
for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
- NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
+ NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
+ Op.getOperand(I)));
SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
return DAG.getBitcast(VT, Res);
}
@@ -21515,9 +21517,8 @@ SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
RTLIB::Libcall LC =
RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
SDValue Res =
- makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
- DAG.getBitcast(MVT::i32, Res));
+ makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
+ return DAG.getBitcast(MVT::i16, Res);
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 9bd1622cb0d3a6..32745400a38b7e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1714,16 +1714,6 @@ namespace llvm {
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
- bool splitValueIntoRegisterParts(
- SelectionDAG & DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
- unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC)
- const override;
-
- SDValue joinRegisterPartsIntoValue(
- SelectionDAG & DAG, const SDLoc &DL, const SDValue *Parts,
- unsigned NumParts, MVT PartVT, EVT ValueVT,
- std::optional<CallingConv::ID> CC) const override;
-
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index b8b5421b900501..d75bd4171fde9d 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -127,6 +127,9 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return getRegisterTypeForCallingConv(Context, CC,
VT.changeVectorElementType(MVT::f16));
+ if (VT == MVT::bf16)
+ return MVT::f16;
+
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -421,40 +424,6 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {
return TargetLowering::getJumpTableEncoding();
}
-bool X86TargetLowering::splitValueIntoRegisterParts(
- SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
- unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
- bool IsABIRegCopy = CC.has_value();
- EVT ValueVT = Val.getValueType();
- if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
- unsigned ValueBits = ValueVT.getSizeInBits();
- unsigned PartBits = PartVT.getSizeInBits();
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
- Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
- Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
- Parts[0] = Val;
- return true;
- }
- return false;
-}
-
-SDValue X86TargetLowering::joinRegisterPartsIntoValue(
- SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
- MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
- bool IsABIRegCopy = CC.has_value();
- if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
- unsigned ValueBits = ValueVT.getSizeInBits();
- unsigned PartBits = PartVT.getSizeInBits();
- SDValue Val = Parts[0];
-
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
- Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
- Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
- return Val;
- }
- return SDValue();
-}
-
bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 9c65310f79d7ec..30b99f32e1b64d 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -17,27 +17,58 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rbx)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; AVX-LABEL: add:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: movzwl (%rsi), %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm1
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: callq __truncsfbf2@PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: movw %ax, (%rbx)
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: retq
+; BF16-LABEL: add:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rbx
+; BF16-NEXT: movq %rdx, %rbx
+; BF16-NEXT: movzwl (%rsi), %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: movzwl (%rdi), %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm1
+; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; BF16-NEXT: callq __truncsfbf2@PLT
+; BF16-NEXT: vpextrw $0, %xmm0, (%rbx)
+; BF16-NEXT: popq %rbx
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rbx
+; FP16-NEXT: movq %rdx, %rbx
+; FP16-NEXT: movzwl (%rsi), %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: movzwl (%rdi), %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm1
+; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT: callq __truncsfbf2@PLT
+; FP16-NEXT: vmovsh %xmm0, (%rbx)
+; FP16-NEXT: popq %rbx
+; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: pushq %rbx
+; AVXNC-NEXT: movq %rdx, %rbx
+; AVXNC-NEXT: movzwl (%rsi), %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: movzwl (%rdi), %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm1
+; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT: callq __truncsfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, (%rbx)
+; AVXNC-NEXT: popq %rbx
+; AVXNC-NEXT: retq
%a = load bfloat, ptr %pa
%b = load bfloat, ptr %pb
%add = fadd bfloat %a, %b
@@ -49,8 +80,8 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; SSE2-LABEL: add2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: shll $16, %eax
@@ -60,19 +91,47 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; AVX-LABEL: add2:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rax
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vmovd %xmm1, %ecx
-; AVX-NEXT: shll $16, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm0
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm1
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: callq __truncsfbf2@PLT
-; AVX-NEXT: popq %rax
-; AVX-NEXT: retq
+; BF16-LABEL: add2:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rax
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: vpextrw $0, %xmm1, %ecx
+; BF16-NEXT: shll $16, %ecx
+; BF16-NEXT: vmovd %ecx, %xmm0
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm1
+; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; BF16-NEXT: callq __truncsfbf2@PLT
+; BF16-NEXT: popq %rax
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add2:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: vmovw %xmm1, %ecx
+; FP16-NEXT: shll $16, %ecx
+; FP16-NEXT: vmovd %ecx, %xmm0
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm1
+; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT: callq __truncsfbf2@PLT
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add2:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: pushq %rax
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: vpextrw $0, %xmm1, %ecx
+; AVXNC-NEXT: shll $16, %ecx
+; AVXNC-NEXT: vmovd %ecx, %xmm0
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm1
+; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT: callq __truncsfbf2@PLT
+; AVXNC-NEXT: popq %rax
+; AVXNC-NEXT: retq
%add = fadd bfloat %a, %b
ret bfloat %add
}
@@ -87,17 +146,17 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: movq %rsi, %r14
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movd %ebp, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
@@ -107,34 +166,92 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; AVX-LABEL: add_double:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbp
-; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: movq %rsi, %r14
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: callq __truncdfbf2@PLT
-; AVX-NEXT: vmovd %xmm0, %ebp
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: callq __truncdfbf2@PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: shll $16, %ebp
-; AVX-NEXT: vmovd %ebp, %xmm1
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: callq __truncsfbf2@PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovsd %xmm0, (%rbx)
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %r14
-; AVX-NEXT: popq %rbp
-; AVX-NEXT: retq
+; BF16-LABEL: add_double:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rbp
+; BF16-NEXT: pushq %r14
+; BF16-NEXT: pushq %rbx
+; BF16-NEXT: movq %rdx, %rbx
+; BF16-NEXT: movq %rsi, %r14
+; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; BF16-NEXT: callq __truncdfbf2@PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %ebp
+; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; BF16-NEXT: callq __truncdfbf2@PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: shll $16, %ebp
+; BF16-NEXT: vmovd %ebp, %xmm1
+; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; BF16-NEXT: callq __truncsfbf2@PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; BF16-NEXT: vmovsd %xmm0, (%rbx)
+; BF16-NEXT: popq %rbx
+; BF16-NEXT: popq %r14
+; BF16-NEXT: popq %rbp
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add_double:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rbp
+; FP16-NEXT: pushq %r14
+; FP16-NEXT: pushq %rbx
+; FP16-NEXT: movq %rdx, %rbx
+; FP16-NEXT: movq %rsi, %r14
+; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FP16-NEXT: callq __truncdfbf2@PLT
+; FP16-NEXT: vmovw %xmm0, %ebp
+; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FP16-NEXT: callq __truncdfbf2@PLT
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: shll $16, %ebp
+; FP16-NEXT: vmovd %ebp, %xmm1
+; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT: callq __truncsfbf2@PLT
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; FP16-NEXT: vmovsd %xmm0, (%rbx)
+; FP16-NEXT: popq %rbx
+; FP16-NEXT: popq %r14
+; FP16-NEXT: popq %rbp
+; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add_double:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: pushq %rbp
+; AVXNC-NEXT: pushq %r14
+; AVXNC-NEXT: pushq %rbx
+; AVXNC-NEXT: movq %rdx, %rbx
+; AVXNC-NEXT: movq %rsi, %r14
+; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVXNC-NEXT: callq __truncdfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
+; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVXNC-NEXT: callq __truncdfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: shll $16, %ebp
+; AVXNC-NEXT: vmovd %ebp, %xmm1
+; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT: callq __truncsfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVXNC-NEXT: vmovsd %xmm0, (%rbx)
+; AVXNC-NEXT: popq %rbx
+; AVXNC-NEXT: popq %r14
+; AVXNC-NEXT: popq %rbp
+; AVXNC-NEXT: retq
%la = load double, ptr %pa
%a = fptrunc double %la to bfloat
%lb = load double, ptr %pb
@@ -152,18 +269,18 @@ define double @add_double2(double %da, double %db) nounwind {
; SSE2-NEXT: subq $16, %rsp
; SSE2-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: callq __truncdfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd %ebx, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
@@ -171,30 +288,80 @@ define double @add_double2(double %da, double %db) nounwind {
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; AVX-LABEL: add_double2:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: subq $16, %rsp
-; AVX-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX-NEXT: callq __truncdfbf2@PLT
-; AVX-NEXT: vmovd %xmm0, %ebx
-; AVX-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0],zero
-; AVX-NEXT: callq __truncdfbf2@PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: shll $16, %ebx
-; AVX-NEXT: vmovd %ebx, %xmm1
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: callq __truncsfbf2@PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: addq $16, %rsp
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: retq
+; BF16-LABEL: add_double2:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rbx
+; BF16-NEXT: subq $16, %rsp
+; BF16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; BF16-NEXT: callq __truncdfbf2@PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %ebx
+; BF16-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; BF16-NEXT: # xmm0 = mem[0],zero
+; BF16-NEXT: callq __truncdfbf2@PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: shll $16, %ebx
+; BF16-NEXT: vmovd %ebx, %xmm1
+; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; BF16-NEXT: callq __truncsfbf2@PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; BF16-NEXT: addq $16, %rsp
+; BF16-NEXT: popq %rbx
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add_double2:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rbx
+; FP16-NEXT: subq $16, %rsp
+; FP16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FP16-NEXT: callq __truncdfbf2@PLT
+; FP16-NEXT: vmovw %xmm0, %ebx
+; FP16-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; FP16-NEXT: # xmm0 = mem[0],zero
+; FP16-NEXT: callq __truncdfbf2@PLT
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: shll $16, %ebx
+; FP16-NEXT: vmovd %ebx, %xmm1
+; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT: callq __truncsfbf2@PLT
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; FP16-NEXT: addq $16, %rsp
+; FP16-NEXT: popq %rbx
+; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add_double2:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: pushq %rbx
+; AVXNC-NEXT: subq $16, %rsp
+; AVXNC-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVXNC-NEXT: callq __truncdfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
+; AVXNC-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; AVXNC-NEXT: # xmm0 = mem[0],zero
+; AVXNC-NEXT: callq __truncdfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: shll $16, %ebx
+; AVXNC-NEXT: vmovd %ebx, %xmm1
+; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT: callq __truncsfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVXNC-NEXT: addq $16, %rsp
+; AVXNC-NEXT: popq %rbx
+; AVXNC-NEXT: retq
%a = fptrunc double %da to bfloat
%b = fptrunc double %db to bfloat
%add = fadd bfloat %a, %b
@@ -212,24 +379,49 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind {
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rbx)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; AVX-LABEL: add_constant:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq %rsi, %rbx
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: callq __truncsfbf2@PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: movw %ax, (%rbx)
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: retq
+; BF16-LABEL: add_constant:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rbx
+; BF16-NEXT: movq %rsi, %rbx
+; BF16-NEXT: movzwl (%rdi), %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BF16-NEXT: callq __truncsfbf2@PLT
+; BF16-NEXT: vpextrw $0, %xmm0, (%rbx)
+; BF16-NEXT: popq %rbx
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add_constant:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rbx
+; FP16-NEXT: movq %rsi, %rbx
+; FP16-NEXT: movzwl (%rdi), %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FP16-NEXT: callq __truncsfbf2@PLT
+; FP16-NEXT: vmovsh %xmm0, (%rbx)
+; FP16-NEXT: popq %rbx
+; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add_constant:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: pushq %rbx
+; AVXNC-NEXT: movq %rsi, %rbx
+; AVXNC-NEXT: movzwl (%rdi), %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVXNC-NEXT: callq __truncsfbf2@PLT
+; AVXNC-NEXT: vpextrw $0, %xmm0, (%rbx)
+; AVXNC-NEXT: popq %rbx
+; AVXNC-NEXT: retq
%a = load bfloat, ptr %pa
%add = fadd bfloat %a, 1.0
store bfloat %add, ptr %pc
@@ -240,7 +432,7 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; SSE2-LABEL: add_constant2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -248,16 +440,38 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; AVX-LABEL: add_constant2:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rax
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: callq __truncsfbf2@PLT
-; AVX-NEXT: popq %rax
-; AVX-NEXT: retq
+; BF16-LABEL: add_constant2:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rax
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BF16-NEXT: callq __truncsfbf2@PLT
+; BF16-NEXT: popq %rax
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add_constant2:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FP16-NEXT: callq __truncsfbf2@PLT
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; AVXNC-LABEL: add_constant2:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: pushq %rax
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVXNC-NEXT: callq __truncsfbf2@PLT
+; AVXNC-NEXT: popq %rax
+; AVXNC-NEXT: retq
%add = fadd bfloat %a, 1.0
ret bfloat %add
}
@@ -306,93 +520,92 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: shrq $32, %rax
+; SSE2-NEXT: shrq $48, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rdx, %rax
-; SSE2-NEXT: shrq $32, %rax
+; SSE2-NEXT: shrq $48, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: shrq $48, %rax
+; SSE2-NEXT: shrq $32, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rdx, %rax
-; SSE2-NEXT: shrq $48, %rax
+; SSE2-NEXT: shrq $32, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT: movq %xmm0, %r12
-; SSE2-NEXT: movq %r12, %rax
-; SSE2-NEXT: shrq $32, %rax
-; SSE2-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; SSE2-NEXT: movq %xmm0, %r15
+; SSE2-NEXT: movq %r15, %rbx
+; SSE2-NEXT: shrq $48, %rbx
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT: movq %xmm1, %r14
; SSE2-NEXT: movq %r14, %rbp
-; SSE2-NEXT: shrq $32, %rbp
-; SSE2-NEXT: movq %r12, %r15
-; SSE2-NEXT: shrq $48, %r15
+; SSE2-NEXT: shrq $48, %rbp
+; SSE2-NEXT: movq %r15, %r12
+; SSE2-NEXT: shrq $32, %r12
; SSE2-NEXT: movq %r14, %r13
-; SSE2-NEXT: shrq $48, %r13
+; SSE2-NEXT: shrq $32, %r13
; SSE2-NEXT: movl %r14d, %eax
-; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movl %r12d, %eax
-; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; SSE2-NEXT: movl %r15d, %eax
+; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %ebx
-; SSE2-NEXT: shll $16, %ebx
-; SSE2-NEXT: shll $16, %r14d
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: andl $-65536, %r14d # imm = 0xFFFF0000
; SSE2-NEXT: movd %r14d, %xmm1
-; SSE2-NEXT: shll $16, %r12d
-; SSE2-NEXT: movd %r12d, %xmm0
+; SSE2-NEXT: andl $-65536, %r15d # imm = 0xFFFF0000
+; SSE2-NEXT: movd %r15d, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movzwl %ax, %r12d
-; SSE2-NEXT: orl %ebx, %r12d
+; SSE2-NEXT: pextrw $0, %xmm0, %r15d
+; SSE2-NEXT: shll $16, %r15d
+; SSE2-NEXT: addl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
; SSE2-NEXT: shll $16, %r13d
; SSE2-NEXT: movd %r13d, %xmm1
-; SSE2-NEXT: shll $16, %r15d
-; SSE2-NEXT: movd %r15d, %xmm0
+; SSE2-NEXT: shll $16, %r12d
+; SSE2-NEXT: movd %r12d, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %r14d
-; SSE2-NEXT: shll $16, %r14d
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movd %ebp, %xmm1
-; SSE2-NEXT: movq (%rsp), %rax # 8-byte Reload
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd %ebx, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movzwl %ax, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: orl %r14d, %ebx
; SSE2-NEXT: shlq $32, %rbx
-; SSE2-NEXT: orq %r12, %rbx
+; SSE2-NEXT: orq %r15, %rbx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; SSE2-NEXT: movl %r15d, %eax
-; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; SSE2-NEXT: movl %r14d, %eax
-; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %ebp
-; SSE2-NEXT: shll $16, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %ebp
; SSE2-NEXT: movq %r15, %rax
-; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq %r14, %rax
-; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: pextrw $0, %xmm0, %r14d
+; SSE2-NEXT: shll $16, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
@@ -402,8 +615,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %ebp
-; SSE2-NEXT: shll $16, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %ebp
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
@@ -412,8 +625,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
@@ -448,7 +661,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vaddss %xmm2, %xmm1, %xmm0
; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; BF16-NEXT: vpextrw $6, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
@@ -459,7 +673,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vmovd %xmm0, %ebp
+; BF16-NEXT: vpextrw $0, %xmm0, %ebp
; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; BF16-NEXT: vpextrw $5, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
@@ -470,7 +684,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vmovd %xmm0, %r14d
+; BF16-NEXT: vpextrw $0, %xmm0, %r14d
; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; BF16-NEXT: vpextrw $4, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
@@ -481,7 +695,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vmovd %xmm0, %r15d
+; BF16-NEXT: vpextrw $0, %xmm0, %r15d
; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; BF16-NEXT: vpextrw $3, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
@@ -492,7 +706,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vmovd %xmm0, %r12d
+; BF16-NEXT: vpextrw $0, %xmm0, %r12d
; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; BF16-NEXT: vpextrw $2, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
@@ -503,31 +717,31 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vmovd %xmm0, %r13d
+; BF16-NEXT: vpextrw $0, %xmm0, %r13d
; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; BF16-NEXT: vpextrw $1, %xmm0, %eax
+; BF16-NEXT: vmovd %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; BF16-NEXT: vpextrw $1, %xmm1, %eax
+; BF16-NEXT: vmovd %xmm1, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vmovd %xmm0, %ebx
+; BF16-NEXT: vpextrw $0, %xmm0, %ebx
; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; BF16-NEXT: vmovd %xmm0, %eax
+; BF16-NEXT: vpextrw $1, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; BF16-NEXT: vmovd %xmm1, %eax
+; BF16-NEXT: vpextrw $1, %xmm1, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BF16-NEXT: callq __truncsfbf2@PLT
-; BF16-NEXT: vmovd %xmm0, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: vmovd %ebx, %xmm0
+; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
@@ -545,13 +759,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
;
; FP16-LABEL: addv:
; FP16: # %bb.0:
-; FP16-NEXT: pushq %rbp
-; FP16-NEXT: pushq %r15
-; FP16-NEXT: pushq %r14
-; FP16-NEXT: pushq %r13
-; FP16-NEXT: pushq %r12
-; FP16-NEXT: pushq %rbx
-; FP16-NEXT: subq $40, %rsp
+; FP16-NEXT: subq $152, %rsp
; FP16-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT: vmovw %xmm1, %eax
@@ -562,40 +770,40 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm2, %xmm1, %xmm0
; FP16-NEXT: callq __truncsfbf2 at PLT
-; FP16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; FP16-NEXT: vpextrw $7, %xmm0, %eax
+; FP16-NEXT: vpextrw $1, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $7, %xmm1, %eax
+; FP16-NEXT: vpextrw $1, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: callq __truncsfbf2 at PLT
-; FP16-NEXT: vmovd %xmm0, %ebp
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; FP16-NEXT: vpextrw $6, %xmm0, %eax
+; FP16-NEXT: vpextrw $2, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $6, %xmm1, %eax
+; FP16-NEXT: vpextrw $2, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: callq __truncsfbf2 at PLT
-; FP16-NEXT: vmovd %xmm0, %r14d
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; FP16-NEXT: vpextrw $5, %xmm0, %eax
+; FP16-NEXT: vpextrw $3, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $5, %xmm1, %eax
+; FP16-NEXT: vpextrw $3, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: callq __truncsfbf2 at PLT
-; FP16-NEXT: vmovd %xmm0, %r15d
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT: vpextrw $4, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
@@ -606,56 +814,54 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: callq __truncsfbf2 at PLT
-; FP16-NEXT: vmovd %xmm0, %r12d
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; FP16-NEXT: vpextrw $3, %xmm0, %eax
+; FP16-NEXT: vpextrw $5, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $3, %xmm1, %eax
+; FP16-NEXT: vpextrw $5, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: callq __truncsfbf2 at PLT
-; FP16-NEXT: vmovd %xmm0, %r13d
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; FP16-NEXT: vpextrw $2, %xmm0, %eax
+; FP16-NEXT: vpextrw $6, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $2, %xmm1, %eax
+; FP16-NEXT: vpextrw $6, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: callq __truncsfbf2 at PLT
-; FP16-NEXT: vmovd %xmm0, %ebx
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; FP16-NEXT: vpextrw $1, %xmm0, %eax
+; FP16-NEXT: vpextrw $7, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpextrw $1, %xmm1, %eax
+; FP16-NEXT: vpextrw $7, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: callq __truncsfbf2 at PLT
-; FP16-NEXT: vmovd %xmm0, %eax
-; FP16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; FP16-NEXT: # xmm0 = mem[0],zero,zero,zero
-; FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; FP16-NEXT: vpinsrw $2, %ebx, %xmm0, %xmm0
-; FP16-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0
-; FP16-NEXT: vpinsrw $4, %r12d, %xmm0, %xmm0
-; FP16-NEXT: vpinsrw $5, %r15d, %xmm0, %xmm0
-; FP16-NEXT: vpinsrw $6, %r14d, %xmm0, %xmm0
-; FP16-NEXT: vpinsrw $7, %ebp, %xmm0, %xmm0
-; FP16-NEXT: addq $40, %rsp
-; FP16-NEXT: popq %rbx
-; FP16-NEXT: popq %r12
-; FP16-NEXT: popq %r13
-; FP16-NEXT: popq %r14
-; FP16-NEXT: popq %r15
-; FP16-NEXT: popq %rbp
+; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
+; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; FP16-NEXT: addq $152, %rsp
; FP16-NEXT: retq
;
; AVXNC-LABEL: addv:
@@ -677,7 +883,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm2, %xmm1, %xmm0
; AVXNC-NEXT: callq __truncsfbf2 at PLT
-; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $6, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
@@ -688,7 +895,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: callq __truncsfbf2 at PLT
-; AVXNC-NEXT: vmovd %xmm0, %ebp
+; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $5, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
@@ -699,7 +906,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: callq __truncsfbf2 at PLT
-; AVXNC-NEXT: vmovd %xmm0, %r14d
+; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $4, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
@@ -710,7 +917,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: callq __truncsfbf2 at PLT
-; AVXNC-NEXT: vmovd %xmm0, %r15d
+; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $3, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
@@ -721,7 +928,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: callq __truncsfbf2 at PLT
-; AVXNC-NEXT: vmovd %xmm0, %r12d
+; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $2, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
@@ -732,31 +939,31 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: callq __truncsfbf2 at PLT
-; AVXNC-NEXT: vmovd %xmm0, %r13d
+; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVXNC-NEXT: vpextrw $1, %xmm0, %eax
+; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVXNC-NEXT: vpextrw $1, %xmm1, %eax
+; AVXNC-NEXT: vmovd %xmm1, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: callq __truncsfbf2 at PLT
-; AVXNC-NEXT: vmovd %xmm0, %ebx
+; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVXNC-NEXT: vmovd %xmm0, %eax
+; AVXNC-NEXT: vpextrw $1, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVXNC-NEXT: vmovd %xmm1, %eax
+; AVXNC-NEXT: vpextrw $1, %xmm1, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: callq __truncsfbf2 at PLT
-; AVXNC-NEXT: vmovd %xmm0, %eax
-; AVXNC-NEXT: vmovd %eax, %xmm0
-; AVXNC-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: vmovd %ebx, %xmm0
+; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
@@ -778,20 +985,29 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
; SSE2-LABEL: pr62997:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movd %xmm1, %ecx
-; SSE2-NEXT: pinsrw $0, %ecx, %xmm1
-; SSE2-NEXT: pinsrw $0, %eax, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
-; AVX-LABEL: pr62997:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm1, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm0
-; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; BF16-LABEL: pr62997:
+; BF16: # %bb.0:
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: vpextrw $0, %xmm1, %ecx
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; BF16-NEXT: retq
+;
+; FP16-LABEL: pr62997:
+; FP16: # %bb.0:
+; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; FP16-NEXT: retq
+;
+; AVXNC-LABEL: pr62997:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: vpextrw $0, %xmm1, %ecx
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVXNC-NEXT: retq
%1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
%2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
ret <2 x bfloat> %2
@@ -1149,8 +1365,8 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
@@ -1166,23 +1382,23 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: .LBB12_67: # %else92
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -1192,23 +1408,23 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -1220,23 +1436,23 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -1246,23 +1462,23 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -1274,23 +1490,23 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -1300,23 +1516,23 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -1328,23 +1544,23 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -1354,23 +1570,23 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -1386,11 +1602,17 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: popq %r14
; SSE2-NEXT: retq
;
-; F16-LABEL: pr63017_2:
-; F16: # %bb.0:
-; F16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
-; F16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
-; F16-NEXT: retq
+; BF16-LABEL: pr63017_2:
+; BF16: # %bb.0:
+; BF16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; BF16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
+; BF16-NEXT: retq
+;
+; FP16-LABEL: pr63017_2:
+; FP16: # %bb.0:
+; FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
+; FP16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
+; FP16-NEXT: retq
;
; AVXNC-LABEL: pr63017_2:
; AVXNC: # %bb.0:
@@ -1650,7 +1872,7 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
; SSE2-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; SSE2-NEXT: andq %rax, %rcx
; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: movd %xmm4, %edx
+; SSE2-NEXT: pextrw $0, %xmm4, %edx
; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %eax, %edx
; SSE2-NEXT: orq %rcx, %rdx
@@ -1658,16 +1880,23 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
-; F16-LABEL: pr62997_3:
-; F16: # %bb.0:
-; F16-NEXT: vmovd %xmm1, %eax
-; F16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1
-; F16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; F16-NEXT: retq
+; BF16-LABEL: pr62997_3:
+; BF16: # %bb.0:
+; BF16-NEXT: vpextrw $0, %xmm1, %eax
+; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1
+; BF16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; BF16-NEXT: retq
+;
+; FP16-LABEL: pr62997_3:
+; FP16: # %bb.0:
+; FP16-NEXT: vmovw %xmm1, %eax
+; FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1
+; FP16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; FP16-NEXT: retq
;
; AVXNC-LABEL: pr62997_3:
; AVXNC: # %bb.0:
-; AVXNC-NEXT: vmovd %xmm2, %eax
+; AVXNC-NEXT: vpextrw $0, %xmm2, %eax
; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2
; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT: retq
@@ -1953,42 +2182,28 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) {
define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
; SSE2-LABEL: fptrunc_v4f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: subq $32, %rsp
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: subq $72, %rsp
+; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
-; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd %xmm0, %ebp
-; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd %xmm0, %r14d
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: pinsrw $0, %eax, %xmm0
-; SSE2-NEXT: pinsrw $0, %r14d, %xmm1
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pinsrw $0, %ebp, %xmm0
-; SSE2-NEXT: pinsrw $0, %ebx, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: addq $32, %rsp
-; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: popq %r14
-; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: addq $72, %rsp
; SSE2-NEXT: retq
;
; F16-LABEL: fptrunc_v4f32:
@@ -2019,22 +2234,22 @@ define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
@@ -2042,22 +2257,22 @@ define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -2102,22 +2317,22 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
@@ -2125,22 +2340,22 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r15d
; SSE2-NEXT: orl %ebp, %r15d
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: shlq $32, %r14
@@ -2148,22 +2363,22 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r12d
; SSE2-NEXT: orl %ebp, %r12d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r15d
; SSE2-NEXT: orl %ebp, %r15d
; SSE2-NEXT: shlq $32, %r15
@@ -2171,22 +2386,22 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r12d
; SSE2-NEXT: orl %ebp, %r12d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -2234,21 +2449,21 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
@@ -2256,21 +2471,21 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
@@ -2284,89 +2499,141 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; F16-LABEL: fptrunc_v8f64:
-; F16: # %bb.0:
-; F16-NEXT: pushq %rbp
-; F16-NEXT: pushq %r15
-; F16-NEXT: pushq %r14
-; F16-NEXT: pushq %r13
-; F16-NEXT: pushq %r12
-; F16-NEXT: pushq %rbx
-; F16-NEXT: subq $136, %rsp
-; F16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; F16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; F16-NEXT: vzeroupper
-; F16-NEXT: callq __truncdfbf2 at PLT
-; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; F16-NEXT: vextractf128 $1, %ymm0, %xmm0
-; F16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; F16-NEXT: vzeroupper
-; F16-NEXT: callq __truncdfbf2 at PLT
-; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; F16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; F16-NEXT: # xmm0 = mem[1,0]
-; F16-NEXT: callq __truncdfbf2 at PLT
-; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; F16-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; F16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; F16-NEXT: vzeroupper
-; F16-NEXT: callq __truncdfbf2 at PLT
-; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; F16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; F16-NEXT: # xmm0 = mem[1,0]
-; F16-NEXT: callq __truncdfbf2 at PLT
-; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; F16-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; F16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; F16-NEXT: vzeroupper
-; F16-NEXT: callq __truncdfbf2 at PLT
-; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; F16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; F16-NEXT: # xmm0 = mem[1,0]
-; F16-NEXT: callq __truncdfbf2 at PLT
-; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; F16-NEXT: # xmm0 = mem[0],zero,zero,zero
-; F16-NEXT: vmovd %xmm0, %ebp
-; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; F16-NEXT: # xmm0 = mem[0],zero,zero,zero
-; F16-NEXT: vmovd %xmm0, %r14d
-; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; F16-NEXT: # xmm0 = mem[0],zero,zero,zero
-; F16-NEXT: vmovd %xmm0, %r15d
-; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; F16-NEXT: # xmm0 = mem[0],zero,zero,zero
-; F16-NEXT: vmovd %xmm0, %r12d
-; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; F16-NEXT: # xmm0 = mem[0],zero,zero,zero
-; F16-NEXT: vmovd %xmm0, %r13d
-; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; F16-NEXT: # xmm0 = mem[0],zero,zero,zero
-; F16-NEXT: vmovd %xmm0, %ebx
-; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; F16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; F16-NEXT: vzeroupper
-; F16-NEXT: callq __truncdfbf2 at PLT
-; F16-NEXT: vmovd %xmm0, %eax
-; F16-NEXT: vmovd %eax, %xmm0
-; F16-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0
-; F16-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
-; F16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
-; F16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
-; F16-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
-; F16-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
-; F16-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
-; F16-NEXT: addq $136, %rsp
-; F16-NEXT: popq %rbx
-; F16-NEXT: popq %r12
-; F16-NEXT: popq %r13
-; F16-NEXT: popq %r14
-; F16-NEXT: popq %r15
-; F16-NEXT: popq %rbp
-; F16-NEXT: retq
+; BF16-LABEL: fptrunc_v8f64:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rbp
+; BF16-NEXT: pushq %r15
+; BF16-NEXT: pushq %r14
+; BF16-NEXT: pushq %r13
+; BF16-NEXT: pushq %r12
+; BF16-NEXT: pushq %rbx
+; BF16-NEXT: subq $184, %rsp
+; BF16-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; BF16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; BF16-NEXT: vzeroupper
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; BF16-NEXT: vextractf128 $1, %ymm0, %xmm0
+; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BF16-NEXT: vzeroupper
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BF16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; BF16-NEXT: # xmm0 = mem[1,0]
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; BF16-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; BF16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; BF16-NEXT: vzeroupper
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BF16-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; BF16-NEXT: # xmm0 = mem[1,0]
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; BF16-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; BF16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; BF16-NEXT: vzeroupper
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BF16-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; BF16-NEXT: # xmm0 = mem[1,0]
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: movl %eax, (%rsp) # 4-byte Spill
+; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; BF16-NEXT: vpextrw $0, %xmm0, %ebp
+; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; BF16-NEXT: vpextrw $0, %xmm0, %r14d
+; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; BF16-NEXT: vpextrw $0, %xmm0, %r15d
+; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; BF16-NEXT: vpextrw $0, %xmm0, %r12d
+; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; BF16-NEXT: vpextrw $0, %xmm0, %r13d
+; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; BF16-NEXT: vpextrw $0, %xmm0, %ebx
+; BF16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; BF16-NEXT: # xmm0 = mem[1,0]
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: vmovd %ebx, %xmm0
+; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; BF16-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
+; BF16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
+; BF16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
+; BF16-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
+; BF16-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
+; BF16-NEXT: vpinsrw $7, (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; BF16-NEXT: addq $184, %rsp
+; BF16-NEXT: popq %rbx
+; BF16-NEXT: popq %r12
+; BF16-NEXT: popq %r13
+; BF16-NEXT: popq %r14
+; BF16-NEXT: popq %r15
+; BF16-NEXT: popq %rbp
+; BF16-NEXT: retq
+;
+; FP16-LABEL: fptrunc_v8f64:
+; FP16: # %bb.0:
+; FP16-NEXT: subq $184, %rsp
+; FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; FP16-NEXT: vextractf128 $1, %ymm0, %xmm0
+; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; FP16-NEXT: vzeroupper
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; FP16-NEXT: # xmm0 = mem[1,0]
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; FP16-NEXT: vzeroupper
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; FP16-NEXT: vzeroupper
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; FP16-NEXT: vzeroupper
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
+; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; FP16-NEXT: addq $184, %rsp
+; FP16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v8f64:
; AVXNC: # %bb.0:
@@ -2376,74 +2643,68 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; AVXNC-NEXT: pushq %r13
; AVXNC-NEXT: pushq %r12
; AVXNC-NEXT: pushq %rbx
-; AVXNC-NEXT: subq $120, %rsp
+; AVXNC-NEXT: subq $168, %rsp
; AVXNC-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVXNC-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVXNC-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero
-; AVXNC-NEXT: vmovd %xmm0, %ebp
-; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero
-; AVXNC-NEXT: vmovd %xmm0, %r14d
-; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero
-; AVXNC-NEXT: vmovd %xmm0, %r15d
-; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero
-; AVXNC-NEXT: vmovd %xmm0, %r12d
-; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero
-; AVXNC-NEXT: vmovd %xmm0, %r13d
-; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero
-; AVXNC-NEXT: vmovd %xmm0, %ebx
-; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVXNC-NEXT: vzeroupper
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
+; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d
+; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d
+; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d
+; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d
+; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
+; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovd %xmm0, %eax
-; AVXNC-NEXT: vmovd %eax, %xmm0
-; AVXNC-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: vmovd %ebx, %xmm0
+; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVXNC-NEXT: addq $120, %rsp
+; AVXNC-NEXT: addq $168, %rsp
; AVXNC-NEXT: popq %rbx
; AVXNC-NEXT: popq %r12
; AVXNC-NEXT: popq %r13
More information about the flang-commits
mailing list