[llvm] 02fe96b - [X86][FP16] Do not split FP64->FP16 to FP64->FP32->FP16
Phoebe Wang via llvm-commits
llvm-commits@lists.llvm.org
Thu Jul 21 17:36:41 PDT 2022
Author: Phoebe Wang
Date: 2022-07-22T08:36:05+08:00
New Revision: 02fe96b24018bb8ce65cb264e0621459507cf989
URL: https://github.com/llvm/llvm-project/commit/02fe96b24018bb8ce65cb264e0621459507cf989
DIFF: https://github.com/llvm/llvm-project/commit/02fe96b24018bb8ce65cb264e0621459507cf989.diff
LOG: [X86][FP16] Do not split FP64->FP16 to FP64->FP32->FP16
Truncating double to half directly is not always identical to truncating to float first and then to half: the intermediate rounding to float can change the final result (double rounding). https://godbolt.org/z/56s9517hd
On the other hand, extending half to float and then to double is always identical to extending half to double directly, since every half value is exactly representable in float and every float value is exactly representable in double. https://godbolt.org/z/Ye8vbYPnY
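To make the truncation point concrete, here is a small standalone C++ sketch (not part of this patch; it assumes a compiler with the _Float16 extension type, e.g. recent Clang on x86-64, and the default round-to-nearest-even mode) where the single-step and two-step conversions disagree:

#include <cstdio>

int main() {
  // 1 + 2^-11 is exactly halfway between the adjacent half values
  // 1.0 and 1.0009765625. The extra 2^-40 term is representable in
  // double but far smaller than half a float ulp at this magnitude.
  double d = 1.0 + 0x1p-11 + 0x1p-40;

  // One rounding step: the value lies above the midpoint, so it
  // rounds up to 1.0009765625.
  _Float16 direct = (_Float16)d;

  // Two rounding steps: narrowing to float drops the 2^-40 term,
  // leaving an exact tie, and round-to-nearest-even then picks 1.0.
  _Float16 via_float = (_Float16)(float)d;

  printf("direct:    %g\n", (double)direct);    // 1.00098
  printf("via float: %g\n", (double)via_float); // 1
  return 0;
}

This is why, without AVX512-FP16, the f64->f16 case now falls back to the __truncdfhf2 libcall (as the updated tests show) instead of going through an f32 intermediate.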
Reviewed By: RKSimon, skan
Differential Revision: https://reviews.llvm.org/D130151
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/cvt16.ll
llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
llvm/test/CodeGen/X86/half-constrained.ll
llvm/test/CodeGen/X86/half.ll
llvm/test/CodeGen/X86/vector-half-conversions.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 09786a9a3ca5a..a54e899a1cc2c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22914,33 +22914,14 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
- SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1);
MVT VT = Op.getSimpleValueType();
MVT SVT = In.getSimpleValueType();
if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
return SDValue();
- if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT)) {
- if (Subtarget.hasFP16())
- return Op;
-
- if (SVT.getScalarType() != MVT::f32) {
- MVT TmpVT =
- VT.isVector() ? SVT.changeVectorElementType(MVT::f32) : MVT::f32;
- if (IsStrict)
- return DAG.getNode(
- ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
- {Chain,
- DAG.getNode(ISD::STRICT_FP_ROUND, DL, {TmpVT, MVT::Other},
- {Chain, In, Op2}),
- Op2});
-
- return DAG.getNode(ISD::FP_ROUND, DL, VT,
- DAG.getNode(ISD::FP_ROUND, DL, TmpVT, In, Op2), Op2);
- }
-
- if (!Subtarget.hasF16C())
+ if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
+ if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
return SDValue();
if (VT.isVector())
@@ -32983,19 +32964,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
- if (SrcVT == MVT::v2f64) {
- if (IsStrict)
- Src = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
- {MVT::v4f32, MVT::Other}, {Chain, Src});
- else
- Src = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Src);
- } else if (SrcVT == MVT::v4f64) {
- if (IsStrict)
- Src = DAG.getNode(ISD::STRICT_FP_ROUND, dl, {MVT::v4f32, MVT::Other},
- {Chain, Src, Rnd});
- else
- Src = DAG.getNode(ISD::FP_ROUND, dl, MVT::v4f32, Src, Rnd);
- }
+ if (SrcVT.getVectorElementType() != MVT::f32)
+ return;
if (IsStrict)
V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll
index 5af60b3c59964..a4f70d958ecc6 100644
--- a/llvm/test/CodeGen/X86/cvt16.ll
+++ b/llvm/test/CodeGen/X86/cvt16.ll
@@ -154,10 +154,11 @@ define i16 @test5(double %src) nounwind {
;
; F16C-LABEL: test5:
; F16C: # %bb.0:
-; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: callq __truncdfhf2 at PLT
+; F16C-NEXT: vpextrw $0, %xmm0, %eax
; F16C-NEXT: # kill: def $ax killed $ax killed $eax
+; F16C-NEXT: popq %rcx
; F16C-NEXT: retq
;
; SOFTFLOAT-LABEL: test5:
diff --git a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
index 1069595449bbd..5afa12cd4b477 100644
--- a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
+++ b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
@@ -1,24 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL --check-prefix=F16C
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL
define zeroext i16 @test1_fast(double %d) #0 {
-; F16C-LABEL: test1_fast:
-; F16C: # %bb.0: # %entry
-; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: # kill: def $ax killed $ax killed $eax
-; F16C-NEXT: retq
-;
-; AVX-LABEL: test1_fast:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: pushq %rax
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vpextrw $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: popq %rcx
-; AVX-NEXT: retq
+; ALL-LABEL: test1_fast:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: pushq %rax
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vpextrw $0, %xmm0, %eax
+; ALL-NEXT: # kill: def $ax killed $ax killed $eax
+; ALL-NEXT: popq %rcx
+; ALL-NEXT: retq
entry:
%0 = tail call i16 @llvm.convert.to.fp16.f64(double %d)
ret i16 %0
@@ -41,22 +33,14 @@ entry:
}
define zeroext i16 @test1(double %d) #1 {
-; F16C-LABEL: test1:
-; F16C: # %bb.0: # %entry
-; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: # kill: def $ax killed $ax killed $eax
-; F16C-NEXT: retq
-;
-; AVX-LABEL: test1:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: pushq %rax
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vpextrw $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: popq %rcx
-; AVX-NEXT: retq
+; ALL-LABEL: test1:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: pushq %rax
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vpextrw $0, %xmm0, %eax
+; ALL-NEXT: # kill: def $ax killed $ax killed $eax
+; ALL-NEXT: popq %rcx
+; ALL-NEXT: retq
entry:
%0 = tail call i16 @llvm.convert.to.fp16.f64(double %d)
ret i16 %0
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index c09af463c9cb5..3ecddd5279814 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -357,13 +357,12 @@ define void @fptrunc_double_to_f16(ptr %val, ptr%ret) nounwind strictfp {
;
; AVX-LABEL: fptrunc_double_to_f16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: movw %ax, (%rsi)
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: movq %rsi, %rbx
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vpextrw $0, %xmm0, (%rbx)
+; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; X86-LABEL: fptrunc_double_to_f16:
diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll
index ffd0035059b55..23e201936ddec 100644
--- a/llvm/test/CodeGen/X86/half-constrained.ll
+++ b/llvm/test/CodeGen/X86/half-constrained.ll
@@ -201,13 +201,13 @@ define void @double_to_half(double %0) strictfp {
;
; X32-F16C-LABEL: double_to_half:
; X32-F16C: ## %bb.0:
-; X32-F16C-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; X32-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X32-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; X32-F16C-NEXT: vmovd %xmm0, %eax
-; X32-F16C-NEXT: movw %ax, _a
+; X32-F16C-NEXT: subl $12, %esp
+; X32-F16C-NEXT: .cfi_def_cfa_offset 16
+; X32-F16C-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-F16C-NEXT: vmovq %xmm0, (%esp)
+; X32-F16C-NEXT: calll ___truncdfhf2
+; X32-F16C-NEXT: vpextrw $0, %xmm0, _a
+; X32-F16C-NEXT: addl $12, %esp
; X32-F16C-NEXT: retl
;
; X64-NOF16C-LABEL: double_to_half:
@@ -222,12 +222,11 @@ define void @double_to_half(double %0) strictfp {
;
; X64-F16C-LABEL: double_to_half:
; X64-F16C: ## %bb.0:
-; X64-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; X64-F16C-NEXT: vmovd %xmm0, %eax
-; X64-F16C-NEXT: movw %ax, _a(%rip)
+; X64-F16C-NEXT: pushq %rax
+; X64-F16C-NEXT: .cfi_def_cfa_offset 16
+; X64-F16C-NEXT: callq ___truncdfhf2
+; X64-F16C-NEXT: vpextrw $0, %xmm0, _a(%rip)
+; X64-F16C-NEXT: popq %rax
; X64-F16C-NEXT: retq
%2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f64(double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
store half %2, ptr @a, align 2
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index d2b9e51bba09b..9676bf2f9764a 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -183,10 +183,11 @@ define void @test_trunc64(double %in, ptr %addr) #0 {
;
; BWON-F16C-LABEL: test_trunc64:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vmovd %xmm0, %eax
-; BWON-F16C-NEXT: movw %ax, (%rdi)
+; BWON-F16C-NEXT: pushq %rbx
+; BWON-F16C-NEXT: movq %rdi, %rbx
+; BWON-F16C-NEXT: callq __truncdfhf2 at PLT
+; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
+; BWON-F16C-NEXT: popq %rbx
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_trunc64:
@@ -681,9 +682,36 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 {
;
; BWON-F16C-LABEL: test_trunc64_vec4:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: vcvtpd2ps %ymm0, %xmm0
-; BWON-F16C-NEXT: vcvtps2ph $0, %xmm0, (%rdi)
+; BWON-F16C-NEXT: pushq %rbx
+; BWON-F16C-NEXT: subq $64, %rsp
+; BWON-F16C-NEXT: movq %rdi, %rbx
+; BWON-F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
+; BWON-F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; BWON-F16C-NEXT: vzeroupper
+; BWON-F16C-NEXT: callq __truncdfhf2 at PLT
+; BWON-F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BWON-F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; BWON-F16C-NEXT: # xmm0 = mem[1,0]
+; BWON-F16C-NEXT: callq __truncdfhf2 at PLT
+; BWON-F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; BWON-F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; BWON-F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; BWON-F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; BWON-F16C-NEXT: vzeroupper
+; BWON-F16C-NEXT: callq __truncdfhf2 at PLT
+; BWON-F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; BWON-F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; BWON-F16C-NEXT: # xmm0 = mem[1,0]
+; BWON-F16C-NEXT: callq __truncdfhf2 at PLT
+; BWON-F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; BWON-F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; BWON-F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; BWON-F16C-NEXT: vmovq %xmm0, (%rbx)
+; BWON-F16C-NEXT: addq $64, %rsp
+; BWON-F16C-NEXT: popq %rbx
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_trunc64_vec4:
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 23abe9bac5e4d..ac14b415f35ba 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -589,10 +589,11 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
define i16 @cvt_f64_to_i16(double %a0) nounwind {
; ALL-LABEL: cvt_f64_to_i16:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: pushq %rax
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vpextrw $0, %xmm0, %eax
; ALL-NEXT: # kill: def $ax killed $ax killed $eax
+; ALL-NEXT: popq %rcx
; ALL-NEXT: retq
%1 = fptrunc double %a0 to half
%2 = bitcast half %1 to i16
@@ -600,35 +601,159 @@ define i16 @cvt_f64_to_i16(double %a0) nounwind {
}
define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
-; ALL-LABEL: cvt_2f64_to_2i16:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %xmm0, %xmm0
-; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; ALL-NEXT: retq
+; AVX-LABEL: cvt_2f64_to_2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: retq
%1 = fptrunc <2 x double> %a0 to <2 x half>
%2 = bitcast <2 x half> %1 to <2 x i16>
ret <2 x i16> %2
}
define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
-; ALL-LABEL: cvt_4f64_to_4i16:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; AVX-LABEL: cvt_4f64_to_4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: subq $72, %rsp
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
+; AVX-NEXT: addq $72, %rsp
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: cvt_4f64_to_4i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: subq $72, %rsp
+; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
+; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
ret <4 x i16> %2
}
define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
-; ALL-LABEL: cvt_4f64_to_8i16_undef:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; AVX-LABEL: cvt_4f64_to_8i16_undef:
+; AVX: # %bb.0:
+; AVX-NEXT: subq $72, %rsp
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
+; AVX-NEXT: addq $72, %rsp
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: cvt_4f64_to_8i16_undef:
+; AVX512: # %bb.0:
+; AVX512-NEXT: subq $72, %rsp
+; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
+; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -638,9 +763,32 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; ALL-LABEL: cvt_4f64_to_8i16_zero:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; ALL-NEXT: subq $72, %rsp
+; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT: vzeroupper
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = mem[1,0]
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = mem[1,0]
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; ALL-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
+; ALL-NEXT: addq $72, %rsp
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -651,19 +799,118 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX-LABEL: cvt_8f64_to_8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vcvtpd2ps %ymm1, %xmm1
-; AVX-NEXT: vcvtps2ph $0, %xmm1, %xmm1
-; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: subq $104, %rsp
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: addq $104, %rsp
; AVX-NEXT: retq
;
; AVX512-LABEL: cvt_8f64_to_8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vcvtpd2ps %zmm0, %ymm0
-; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT: subq $120, %rsp
+; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-NEXT: addq $120, %rsp
; AVX512-NEXT: retq
%1 = fptrunc <8 x double> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
@@ -677,10 +924,11 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
define void @store_cvt_f64_to_i16(double %a0, ptr %a1) nounwind {
; ALL-LABEL: store_cvt_f64_to_i16:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vmovd %xmm0, %eax
-; ALL-NEXT: movw %ax, (%rdi)
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: movq %rdi, %rbx
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vpextrw $0, %xmm0, (%rbx)
+; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
%1 = fptrunc double %a0 to half
%2 = bitcast half %1 to i16
@@ -691,9 +939,20 @@ define void @store_cvt_f64_to_i16(double %a0, ptr %a1) nounwind {
define void @store_cvt_2f64_to_2i16(<2 x double> %a0, ptr %a1) nounwind {
; ALL-LABEL: store_cvt_2f64_to_2i16:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %xmm0, %xmm0
-; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; ALL-NEXT: vmovss %xmm0, (%rdi)
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: subq $32, %rsp
+; ALL-NEXT: movq %rdi, %rbx
+; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = mem[1,0]
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; ALL-NEXT: vmovd %xmm0, (%rbx)
+; ALL-NEXT: addq $32, %rsp
+; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
%1 = fptrunc <2 x double> %a0 to <2 x half>
%2 = bitcast <2 x half> %1 to <2 x i16>
@@ -704,9 +963,36 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, ptr %a1) nounwind {
define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind {
; ALL-LABEL: store_cvt_4f64_to_4i16:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT: vcvtps2ph $0, %xmm0, (%rdi)
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: movq %rdi, %rbx
+; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT: vzeroupper
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = mem[1,0]
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = mem[1,0]
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; ALL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; ALL-NEXT: vmovq %xmm0, (%rbx)
+; ALL-NEXT: addq $64, %rsp
+; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -715,13 +1001,78 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind {
}
define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
-; ALL-LABEL: store_cvt_4f64_to_8i16_undef:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; ALL-NEXT: vmovaps %xmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; AVX-LABEL: store_cvt_4f64_to_8i16_undef:
+; AVX: # %bb.0:
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: movq %rdi, %rbx
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
+; AVX-NEXT: vmovaps %xmm0, (%rbx)
+; AVX-NEXT: addq $64, %rsp
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $64, %rsp
+; AVX512-NEXT: movq %rdi, %rbx
+; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
+; AVX512-NEXT: vmovaps %xmm0, (%rbx)
+; AVX512-NEXT: addq $64, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -732,10 +1083,36 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
; ALL-LABEL: store_cvt_4f64_to_8i16_zero:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; ALL-NEXT: vmovaps %xmm0, (%rdi)
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: movq %rdi, %rbx
+; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT: vzeroupper
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = mem[1,0]
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = mem[1,0]
+; ALL-NEXT: callq __truncdfhf2 at PLT
+; ALL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; ALL-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; ALL-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
+; ALL-NEXT: vmovaps %xmm0, (%rbx)
+; ALL-NEXT: addq $64, %rsp
+; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -747,20 +1124,126 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_8f64_to_8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vcvtpd2ps %ymm1, %xmm1
-; AVX-NEXT: vcvtps2ph $0, %xmm1, %xmm1
-; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vmovaps %xmm0, (%rdi)
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: subq $96, %rsp
+; AVX-NEXT: movq %rdi, %rbx
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncdfhf2 at PLT
+; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovdqa %xmm0, (%rbx)
+; AVX-NEXT: addq $96, %rsp
+; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; AVX512-LABEL: store_cvt_8f64_to_8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vcvtpd2ps %zmm0, %ymm0
-; AVX512-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $112, %rsp
+; AVX512-NEXT: movq %rdi, %rbx
+; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2 at PLT
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rbx)
+; AVX512-NEXT: addq $112, %rsp
+; AVX512-NEXT: popq %rbx
; AVX512-NEXT: retq
%1 = fptrunc <8 x double> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>