[llvm] 139bcda - [X86] SimplifyDemandedVectorEltsForTargetNode - add basic CVTPH2PS/CVTPS2PH handling
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 29 04:34:21 PST 2024
Author: Simon Pilgrim
Date: 2024-02-29T12:33:49Z
New Revision: 139bcda542514b7a064fe9225014ec4268bb2b65
URL: https://github.com/llvm/llvm-project/commit/139bcda542514b7a064fe9225014ec4268bb2b65
DIFF: https://github.com/llvm/llvm-project/commit/139bcda542514b7a064fe9225014ec4268bb2b65.diff
LOG: [X86] SimplifyDemandedVectorEltsForTargetNode - add basic CVTPH2PS/CVTPS2PH handling
Allows us to peek through the F16 conversion nodes, mainly to simplify the shuffles that feed them.
An easy part of #83414.
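
For reference, a minimal sketch of the shared case body that the new CVTPH2PS/CVTPS2PH cases fall into; the hunk below shows only its opening lines, so the demanded-elements recursion spelled out here is an assumption modelled on the existing CVTSI2P/CVTUI2P handling, not verbatim source:

  case X86ISD::CVTSI2P:
  case X86ISD::CVTUI2P:
  case X86ISD::CVTPH2PS:
  case X86ISD::CVTPS2PH: {
    SDValue Src = Op.getOperand(0);
    MVT SrcVT = Src.getSimpleValueType();
    APInt SrcUndef, SrcZero;
    // Only the source elements that map to demanded result lanes are read,
    // so narrow the demanded mask to the source's element count and recurse.
    // This is what lets a wide shuffle feeding the conversion collapse to a
    // cheaper form.
    APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
    if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
                                   Depth + 1))
      return true;
    break;
  }

The test churn below is exactly this effect: constant-pool vpshufb masks and vpmovzxwq/vpmovzxdq widenings ahead of vcvtph2ps either fold away entirely or become immediate-operand shuffles (vpshuflw, vpshufd, vpsrldq).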
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-insert-extract.ll
llvm/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/test/CodeGen/X86/cvt16.ll
llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
llvm/test/CodeGen/X86/fp-roundeven.ll
llvm/test/CodeGen/X86/fpclamptosat_vec.ll
llvm/test/CodeGen/X86/half.ll
llvm/test/CodeGen/X86/pr31088.ll
llvm/test/CodeGen/X86/pr57340.ll
llvm/test/CodeGen/X86/prefer-fpext-splat.ll
llvm/test/CodeGen/X86/vector-half-conversions.ll
llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 93088c7cde938b..d98d914894a3f8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41402,7 +41402,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
break;
}
case X86ISD::CVTSI2P:
- case X86ISD::CVTUI2P: {
+ case X86ISD::CVTUI2P:
+ case X86ISD::CVTPH2PS:
+ case X86ISD::CVTPS2PH: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt SrcUndef, SrcZero;
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 22aae4de4db9d2..3e40bfa1e791d0 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -2171,14 +2171,13 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind {
; KNL-LABEL: test_concat_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; KNL-NEXT: vcvtph2ps %xmm1, %xmm1
+; KNL-NEXT: vcvtph2ps %xmm0, %xmm1
; KNL-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0]
; KNL-NEXT: vucomiss %xmm2, %xmm1
; KNL-NEXT: setb %al
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k0
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
; KNL-NEXT: vucomiss %xmm2, %xmm0
; KNL-NEXT: setb %al
@@ -2207,14 +2206,13 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind {
; SKX-LABEL: test_concat_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,1,1,4,5,6,7]
; SKX-NEXT: vcvtph2ps %xmm1, %xmm1
; SKX-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SKX-NEXT: vucomiss %xmm2, %xmm1
; SKX-NEXT: setb %al
; SKX-NEXT: kmovd %eax, %k0
; SKX-NEXT: kshiftlb $1, %k0, %k0
-; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SKX-NEXT: vcvtph2ps %xmm0, %xmm0
; SKX-NEXT: vucomiss %xmm2, %xmm0
; SKX-NEXT: setb %al
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index f3c728a990f514..86ebb1e40870f8 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1436,9 +1436,8 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; KNL: ## %bb.0: ## %entry
; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; KNL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; KNL-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A]
-; KNL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; KNL-NEXT: vpshuflw $85, %xmm0, %xmm1 ## encoding: [0xc5,0xfb,0x70,0xc8,0x55]
+; KNL-NEXT: ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
; KNL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
@@ -1448,8 +1447,6 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; KNL-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00]
; KNL-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1]
; KNL-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1]
-; KNL-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0]
-; KNL-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
; KNL-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
; KNL-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1]
@@ -1466,9 +1463,8 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; AVX512BW: ## %bb.0: ## %entry
; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A]
-; AVX512BW-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512BW-NEXT: vpshuflw $85, %xmm0, %xmm1 ## encoding: [0xc5,0xfb,0x70,0xc8,0x55]
+; AVX512BW-NEXT: ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
; AVX512BW-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
@@ -1478,8 +1474,6 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; AVX512BW-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00]
; AVX512BW-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1]
; AVX512BW-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1]
-; AVX512BW-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0]
-; AVX512BW-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
; AVX512BW-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
; AVX512BW-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1]
@@ -1496,9 +1490,8 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; SKX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; SKX-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A]
-; SKX-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; SKX-NEXT: vpshuflw $85, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xc8,0x55]
+; SKX-NEXT: ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc9]
; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
; SKX-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca]
@@ -1507,8 +1500,6 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1]
; SKX-NEXT: testb %cl, %cl ## encoding: [0x84,0xc9]
; SKX-NEXT: setne %al ## encoding: [0x0f,0x95,0xc0]
-; SKX-NEXT: vpmovzxwq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xc0]
-; SKX-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
; SKX-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
; SKX-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1]
diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll
index 59097f8fb5d247..c7ef353f7f6038 100644
--- a/llvm/test/CodeGen/X86/cvt16.ll
+++ b/llvm/test/CodeGen/X86/cvt16.ll
@@ -89,7 +89,6 @@ define float @test3(float %src) nounwind uwtable readnone {
; F16C-LABEL: test3:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
index e114c205d7972b..1886e2911ede80 100644
--- a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
@@ -18,8 +18,7 @@ define float @test_cvtsh_ss(i16 %a0) nounwind {
;
; X64-LABEL: test_cvtsh_ss:
; X64: # %bb.0:
-; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vcvtph2ps %xmm0, %xmm0
; X64-NEXT: retq
%ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
@@ -41,8 +40,6 @@ define i16 @test_cvtss_sh(float %a0) nounwind {
; X86-LABEL: test_cvtss_sh:
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X86-NEXT: vcvtps2ph $0, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
@@ -50,8 +47,6 @@ define i16 @test_cvtss_sh(float %a0) nounwind {
;
; X64-LABEL: test_cvtss_sh:
; X64: # %bb.0:
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 5f326b6d6998fb..8f875c70a25f6d 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1432,7 +1432,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
@@ -1447,7 +1446,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
@@ -1550,7 +1548,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax
; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
@@ -1566,7 +1563,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; CHECK-FMA-NEXT: movzwl %ax, %eax
; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll
index fed2060dabd3af..8037c783dd8e67 100644
--- a/llvm/test/CodeGen/X86/fp-roundeven.ll
+++ b/llvm/test/CodeGen/X86/fp-roundeven.ll
@@ -51,7 +51,6 @@ define half @roundeven_f16(half %h) {
; AVX512F-LABEL: roundeven_f16:
; AVX512F: ## %bb.0: ## %entry
; AVX512F-NEXT: vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index a3fb71f817ce47..6aad4c2ebba1d8 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -698,24 +698,23 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) nounwind {
;
; AVX2-LABEL: stest_f16i32:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rcx
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
@@ -837,7 +836,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
;
; AVX2-LABEL: utesth_f16i32:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm2
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
@@ -846,29 +845,28 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
-; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vcvttss2si %xmm2, %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX2-NEXT: andq %rax, %rdx
-; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm4
+; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
; AVX2-NEXT: vcvttss2si %xmm4, %rax
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm4
-; AVX2-NEXT: vcvttss2si %xmm2, %rcx
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm3[0]
+; AVX2-NEXT: vcvttss2si %xmm3, %rcx
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
@@ -879,7 +877,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
@@ -1001,24 +999,23 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind {
;
; AVX2-LABEL: ustest_f16i32:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
@@ -3313,24 +3310,23 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) nounwind {
;
; AVX2-LABEL: stest_f16i32_mm:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rcx
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
@@ -3450,7 +3446,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
;
; AVX2-LABEL: utesth_f16i32_mm:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm2
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
@@ -3459,29 +3455,28 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
-; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vcvttss2si %xmm2, %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX2-NEXT: andq %rax, %rdx
-; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm4
+; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
; AVX2-NEXT: vcvttss2si %xmm4, %rax
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm4
-; AVX2-NEXT: vcvttss2si %xmm2, %rcx
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm3[0]
+; AVX2-NEXT: vcvttss2si %xmm3, %rcx
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
@@ -3492,7 +3487,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1
@@ -3613,24 +3608,23 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind {
;
; AVX2-LABEL: ustest_f16i32_mm:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 2e1322446032ff..9f01d07e6a6705 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -851,16 +851,14 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 {
;
; BWON-F16C-LABEL: test_sitofp_fadd_i32:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: movzwl (%rsi), %eax
; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: movzwl (%rsi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm1
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; BWON-F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: retq
;
@@ -919,7 +917,6 @@ define half @PR40273(half) #0 {
; BWON-F16C-LABEL: PR40273:
; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: xorl %eax, %eax
@@ -973,7 +970,6 @@ define void @brcond(half %0) #0 {
; BWON-F16C-LABEL: brcond:
; BWON-F16C: # %bb.0: # %entry
; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
@@ -1029,7 +1025,6 @@ define half @test_sqrt(half %0) #0 {
; BWON-F16C-LABEL: test_sqrt:
; BWON-F16C: # %bb.0: # %entry
; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
@@ -1083,7 +1078,6 @@ define void @main.158() #0 {
; BWON-F16C: # %bb.0: # %entry
; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
; BWON-F16C-NEXT: vucomiss %xmm1, %xmm2
@@ -1172,8 +1166,7 @@ define void @main.45() #0 {
;
; BWON-F16C-LABEL: main.45:
; BWON-F16C: # %bb.0: # %entry
-; BWON-F16C-NEXT: movzwl (%rax), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: xorl %eax, %eax
@@ -1345,10 +1338,8 @@ define half @pr61271(half %0, half %1) #0 {
; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax
; BWON-F16C-NEXT: vpextrw $0, %xmm1, %ecx
-; BWON-F16C-NEXT: movzwl %cx, %ecx
; BWON-F16C-NEXT: vmovd %ecx, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT: movzwl %ax, %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm1
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; BWON-F16C-NEXT: vminss %xmm0, %xmm1, %xmm0
@@ -1614,10 +1605,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
;
; BWON-F16C-LABEL: maxnum_v8f16:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT: vpshufb %xmm3, %xmm1, %xmm2
+; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vpshufb %xmm3, %xmm0, %xmm3
+; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
; BWON-F16C-NEXT: vucomiss %xmm2, %xmm3
; BWON-F16C-NEXT: ja .LBB26_2
@@ -1625,10 +1615,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vmovaps %xmm2, %xmm3
; BWON-F16C-NEXT: .LBB26_2:
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2
-; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT: vpshufb %xmm4, %xmm1, %xmm3
+; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vpshufb %xmm4, %xmm0, %xmm4
+; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; BWON-F16C-NEXT: vcvtph2ps %xmm4, %xmm4
; BWON-F16C-NEXT: vucomiss %xmm3, %xmm4
; BWON-F16C-NEXT: ja .LBB26_4
@@ -1638,10 +1627,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vmovd %xmm2, %eax
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm4, %xmm2
; BWON-F16C-NEXT: vmovd %xmm2, %ecx
-; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm3
+; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
; BWON-F16C-NEXT: vucomiss %xmm3, %xmm2
; BWON-F16C-NEXT: ja .LBB26_6
@@ -1650,9 +1638,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: .LBB26_6:
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; BWON-F16C-NEXT: vmovd %xmm2, %edx
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm3
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
; BWON-F16C-NEXT: vucomiss %xmm3, %xmm2
; BWON-F16C-NEXT: ja .LBB26_8
@@ -1661,10 +1649,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: .LBB26_8:
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; BWON-F16C-NEXT: vmovd %xmm2, %esi
-; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT: vpshufb %xmm3, %xmm1, %xmm2
+; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vpshufb %xmm3, %xmm0, %xmm3
+; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7]
; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm6
; BWON-F16C-NEXT: vucomiss %xmm2, %xmm6
; BWON-F16C-NEXT: ja .LBB26_10
@@ -1677,9 +1664,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vpinsrw $0, %esi, %xmm0, %xmm5
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm6
; BWON-F16C-NEXT: vmovd %xmm6, %eax
-; BWON-F16C-NEXT: vpsrlq $48, %xmm1, %xmm6
+; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm7
-; BWON-F16C-NEXT: vpsrlq $48, %xmm0, %xmm6
+; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6
; BWON-F16C-NEXT: vucomiss %xmm7, %xmm6
; BWON-F16C-NEXT: ja .LBB26_12
@@ -1687,29 +1674,26 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vmovaps %xmm7, %xmm6
; BWON-F16C-NEXT: .LBB26_12:
; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm5
; BWON-F16C-NEXT: vmovd %xmm5, %eax
; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm5
-; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; BWON-F16C-NEXT: vcvtph2ps %xmm7, %xmm7
-; BWON-F16C-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6
+; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm7
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm6
; BWON-F16C-NEXT: vucomiss %xmm7, %xmm6
; BWON-F16C-NEXT: ja .LBB26_14
; BWON-F16C-NEXT: # %bb.13:
; BWON-F16C-NEXT: vmovaps %xmm7, %xmm6
; BWON-F16C-NEXT: .LBB26_14:
-; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm4
; BWON-F16C-NEXT: vmovd %xmm4, %eax
; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4
-; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0
; BWON-F16C-NEXT: ja .LBB26_16
@@ -1719,7 +1703,7 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; BWON-F16C-NEXT: vmovd %xmm0, %eax
; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; BWON-F16C-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; BWON-F16C-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll
index a21653bc7330c9..ce37622c476db4 100644
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -41,9 +41,7 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
;
; F16C-LABEL: ir_fadd_v1f16:
; F16C: # %bb.0:
-; F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -54,13 +52,15 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
; F16C-O0-LABEL: ir_fadd_v1f16:
; F16C-O0: # %bb.0:
; F16C-O0-NEXT: vpextrw $0, %xmm1, %eax
-; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; F16C-O0-NEXT: movzwl %ax, %eax
+; F16C-O0-NEXT: movw %ax, %cx
+; F16C-O0-NEXT: # implicit-def: $eax
+; F16C-O0-NEXT: movw %cx, %ax
; F16C-O0-NEXT: vmovd %eax, %xmm1
; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-O0-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; F16C-O0-NEXT: movzwl %ax, %eax
+; F16C-O0-NEXT: movw %ax, %cx
+; F16C-O0-NEXT: # implicit-def: $eax
+; F16C-O0-NEXT: movw %cx, %ax
; F16C-O0-NEXT: vmovd %eax, %xmm0
; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll
index 95f839c338e701..00a52c639e43c6 100644
--- a/llvm/test/CodeGen/X86/pr57340.ll
+++ b/llvm/test/CodeGen/X86/pr57340.ll
@@ -5,29 +5,28 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-LABEL: main.41:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastw (%rax), %xmm0
-; CHECK-NEXT: vmovdqu (%rax), %ymm1
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT: vmovdqu (%rax), %xmm10
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; CHECK-NEXT: vpshufb %xmm1, %xmm10, %xmm2
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm4
-; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4
-; CHECK-NEXT: vucomiss %xmm4, %xmm2
-; CHECK-NEXT: setnp %al
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k0
+; CHECK-NEXT: vpextrw $0, %xmm0, %eax
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; CHECK-NEXT: vmovdqu (%rax), %ymm3
+; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: vpermi2w %ymm1, %ymm3, %ymm2
+; CHECK-NEXT: vprold $16, %xmm2, %xmm1
+; CHECK-NEXT: vcvtph2ps %xmm1, %xmm3
+; CHECK-NEXT: vmovdqu (%rax), %xmm5
+; CHECK-NEXT: vprold $16, %xmm5, %xmm1
+; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-NEXT: vucomiss %xmm3, %xmm1
+; CHECK-NEXT: setnp %cl
+; CHECK-NEXT: sete %dl
+; CHECK-NEXT: testb %cl, %dl
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: kmovd %ecx, %k0
; CHECK-NEXT: kshiftlw $15, %k0, %k0
+; CHECK-NEXT: vmovd %eax, %xmm3
+; CHECK-NEXT: vcvtph2ps %xmm3, %xmm3
+; CHECK-NEXT: vcvtph2ps %xmm5, %xmm6
; CHECK-NEXT: kshiftrw $14, %k0, %k0
-; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm4, %xmm11
-; CHECK-NEXT: vucomiss %xmm0, %xmm11
+; CHECK-NEXT: vucomiss %xmm3, %xmm6
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -38,10 +37,10 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-5, %ax
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; CHECK-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5
-; CHECK-NEXT: vucomiss %xmm5, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-NEXT: vcvtph2ps %xmm3, %xmm3
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-NEXT: vucomiss %xmm3, %xmm0
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -52,12 +51,12 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-9, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpsrlq $48, %xmm3, %xmm5
-; CHECK-NEXT: vcvtph2ps %xmm5, %xmm6
-; CHECK-NEXT: vpsrlq $48, %xmm10, %xmm5
-; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm6, %xmm5
+; CHECK-NEXT: vprolq $16, %xmm2, %xmm3
+; CHECK-NEXT: vcvtph2ps %xmm3, %xmm4
+; CHECK-NEXT: vprolq $16, %xmm5, %xmm3
+; CHECK-NEXT: vcvtph2ps %xmm3, %xmm3
+; CHECK-NEXT: vucomiss %xmm4, %xmm3
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -68,11 +67,10 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-17, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7
-; CHECK-NEXT: vucomiss %xmm7, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4
+; CHECK-NEXT: vucomiss %xmm4, %xmm0
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -83,13 +81,12 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-33, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; CHECK-NEXT: vpshufb %xmm7, %xmm10, %xmm8
-; CHECK-NEXT: vcvtph2ps %xmm8, %xmm8
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vcvtph2ps %xmm4, %xmm7
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufb %xmm7, %xmm3, %xmm9
-; CHECK-NEXT: vcvtph2ps %xmm9, %xmm9
-; CHECK-NEXT: vucomiss %xmm9, %xmm8
+; CHECK-NEXT: vucomiss %xmm7, %xmm4
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -100,11 +97,10 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-65, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; CHECK-NEXT: vpshufb %xmm9, %xmm3, %xmm12
-; CHECK-NEXT: vcvtph2ps %xmm12, %xmm12
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm12, %xmm0
+; CHECK-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
+; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7
+; CHECK-NEXT: vucomiss %xmm7, %xmm0
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -116,11 +112,11 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-129, %ax
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm12 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm12, %xmm12
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm10, %xmm10
-; CHECK-NEXT: vucomiss %xmm12, %xmm10
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5
+; CHECK-NEXT: vucomiss %xmm7, %xmm5
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -131,11 +127,10 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-257, %ax # imm = 0xFEFF
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vextracti128 $1, %ymm3, %xmm3
-; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; CHECK-NEXT: vcvtph2ps %xmm12, %xmm12
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vcvtph2ps %xmm2, %xmm7
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm12, %xmm11
+; CHECK-NEXT: vucomiss %xmm7, %xmm6
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -147,9 +142,9 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-513, %ax # imm = 0xFDFF
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
+; CHECK-NEXT: vprold $16, %xmm2, %xmm6
+; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6
+; CHECK-NEXT: vucomiss %xmm6, %xmm1
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -161,7 +156,7 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-1025, %ax # imm = 0xFBFF
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
; CHECK-NEXT: vucomiss %xmm1, %xmm0
; CHECK-NEXT: setnp %al
@@ -175,9 +170,9 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-2049, %ax # imm = 0xF7FF
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpsrlq $48, %xmm3, %xmm1
+; CHECK-NEXT: vprolq $16, %xmm2, %xmm1
; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT: vucomiss %xmm1, %xmm5
+; CHECK-NEXT: vucomiss %xmm1, %xmm3
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -189,7 +184,7 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-4097, %ax # imm = 0xEFFF
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufb %xmm6, %xmm3, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
; CHECK-NEXT: vucomiss %xmm1, %xmm0
; CHECK-NEXT: setnp %al
@@ -203,9 +198,9 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-8193, %ax # imm = 0xDFFF
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufb %xmm7, %xmm3, %xmm1
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT: vucomiss %xmm1, %xmm8
+; CHECK-NEXT: vucomiss %xmm1, %xmm4
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -216,7 +211,7 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-16385, %ax # imm = 0xBFFF
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpshufb %xmm9, %xmm3, %xmm1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
; CHECK-NEXT: kandw %k1, %k0, %k0
; CHECK-NEXT: vucomiss %xmm1, %xmm0
@@ -228,10 +223,10 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: kshiftlw $14, %k1, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kshiftlw $1, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-NEXT: kshiftrw $1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm0, %xmm10
+; CHECK-NEXT: vucomiss %xmm0, %xmm5
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
diff --git a/llvm/test/CodeGen/X86/prefer-fpext-splat.ll b/llvm/test/CodeGen/X86/prefer-fpext-splat.ll
index 1d8b8b3f9a96ec..c3d7b2e15d0170 100644
--- a/llvm/test/CodeGen/X86/prefer-fpext-splat.ll
+++ b/llvm/test/CodeGen/X86/prefer-fpext-splat.ll
@@ -176,8 +176,6 @@ define <2 x double> @prefer_f16_v2f64(ptr %p) nounwind {
; AVX512F-LABEL: prefer_f16_v2f64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vcvtps2pd %xmm0, %xmm0
; AVX512F-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 3b82df5d5b74d2..ba21af231985a1 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -21,15 +21,13 @@ define float @cvt_i16_to_f32(i16 %a0) nounwind {
;
; F16C-LABEL: cvt_i16_to_f32:
; F16C: # %bb.0:
-; F16C-NEXT: movzwl %di, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vmovd %edi, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_i16_to_f32:
; AVX512: # %bb.0:
-; AVX512-NEXT: movzwl %di, %eax
-; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vmovd %edi, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = bitcast i16 %a0 to half
@@ -1370,16 +1368,14 @@ define double @cvt_i16_to_f64(i16 %a0) nounwind {
;
; F16C-LABEL: cvt_i16_to_f64:
; F16C: # %bb.0:
-; F16C-NEXT: movzwl %di, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vmovd %edi, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_i16_to_f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: movzwl %di, %eax
-; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vmovd %edi, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -1410,14 +1406,12 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
;
; F16C-LABEL: cvt_2i16_to_2f64:
; F16C: # %bb.0:
-; F16C-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvtps2pd %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_2i16_to_2f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -1503,14 +1497,12 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
;
; F16C-LABEL: cvt_8i16_to_2f64:
; F16C: # %bb.0:
-; F16C-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvtps2pd %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_8i16_to_2f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -1877,16 +1869,14 @@ define <2 x double> @load_cvt_2i16_to_2f64(ptr %a0) nounwind {
;
; F16C-LABEL: load_cvt_2i16_to_2f64:
; F16C: # %bb.0:
-; F16C-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; F16C-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvtps2pd %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: load_cvt_2i16_to_2f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -4976,9 +4966,9 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
;
; F16C-LABEL: fptosi_2f16_to_4i32:
; F16C: # %bb.0:
-; F16C-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; F16C-NEXT: vpsrld $16, %xmm0, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; F16C-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
@@ -4987,9 +4977,9 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index 6e8eefc607ee11..24113441a4e25a 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -413,10 +413,8 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm3
; AVX512F-NEXT: xorl %eax, %eax
; AVX512F-NEXT: vucomiss %xmm3, %xmm2
; AVX512F-NEXT: movl $255, %ecx
@@ -430,10 +428,8 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512VL-LABEL: test_v2f16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm3
; AVX512VL-NEXT: xorl %eax, %eax
; AVX512VL-NEXT: vucomiss %xmm3, %xmm2
; AVX512VL-NEXT: movl $255, %ecx
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
index 804ca183ad4c9d..edefb16d40e6ed 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
@@ -412,10 +412,8 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm3
; AVX512F-NEXT: xorl %eax, %eax
; AVX512F-NEXT: vucomiss %xmm3, %xmm2
; AVX512F-NEXT: movl $255, %ecx
@@ -429,10 +427,8 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512VL-LABEL: test_v2f16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm3
; AVX512VL-NEXT: xorl %eax, %eax
; AVX512VL-NEXT: vucomiss %xmm3, %xmm2
; AVX512VL-NEXT: movl $255, %ecx