[llvm] 7ff3f97 - [X86] getFauxShuffleMask - handle insert_vector_elt(bitcast(extract_vector_elt(x))) shuffle patterns
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 29 02:33:41 PST 2024
Author: Simon Pilgrim
Date: 2024-02-29T10:32:49Z
New Revision: 7ff3f9760da7d7c8fe9209280aefb05168efcf20
URL: https://github.com/llvm/llvm-project/commit/7ff3f9760da7d7c8fe9209280aefb05168efcf20
DIFF: https://github.com/llvm/llvm-project/commit/7ff3f9760da7d7c8fe9209280aefb05168efcf20.diff
LOG: [X86] getFauxShuffleMask - handle insert_vector_elt(bitcast(extract_vector_elt(x))) shuffle patterns
If the bitcast is between types of equal scalar size (e.g. fp<->int bitcasts), then we can safely peek through it.
Fixes #83289
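As a rough illustration (a hypothetical reduced case, not taken from the commit or the linked issue), IR along the following lines previously lowered each half lane through a scalar vpextrw/movzwl/vmovd round-trip; with this change the extract+bitcast chain is recognized as a shuffle and folds into a single vpmovzxwq/vpshufb, as the test diffs below show:

; Hypothetical example: extract both f16 lanes and widen to f32.
; On F16C targets each fpext legalizes through
; insert_vector_elt(bitcast(extract_vector_elt(x))) feeding vcvtph2ps,
; which getFauxShuffleMask can now peek through.
define <2 x float> @fpext_v2f16(<2 x half> %x) nounwind {
  %e0 = extractelement <2 x half> %x, i32 0
  %e1 = extractelement <2 x half> %x, i32 1
  %f0 = fpext half %e0 to float
  %f1 = fpext half %e1 to float
  %r0 = insertelement <2 x float> poison, float %f0, i32 0
  %r1 = insertelement <2 x float> %r0, float %f1, i32 1
  ret <2 x float> %r1
}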
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-insert-extract.ll
llvm/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/test/CodeGen/X86/fpclamptosat_vec.ll
llvm/test/CodeGen/X86/half.ll
llvm/test/CodeGen/X86/pr31088.ll
llvm/test/CodeGen/X86/pr57340.ll
llvm/test/CodeGen/X86/vector-half-conversions.ll
llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bec13d1c00ef7d..93088c7cde938b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5878,13 +5878,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
}
- // Peek through trunc/aext/zext.
+ // Peek through trunc/aext/zext/bitcast.
// TODO: aext shouldn't require SM_SentinelZero padding.
// TODO: handle shift of scalars.
unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
while (Scl.getOpcode() == ISD::TRUNCATE ||
Scl.getOpcode() == ISD::ANY_EXTEND ||
- Scl.getOpcode() == ISD::ZERO_EXTEND) {
+ Scl.getOpcode() == ISD::ZERO_EXTEND ||
+ (Scl.getOpcode() == ISD::BITCAST &&
+ Scl.getScalarValueSizeInBits() ==
+ Scl.getOperand(0).getScalarValueSizeInBits())) {
Scl = Scl.getOperand(0);
MinBitsPerElt =
std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index abfe3e6428e663..22aae4de4db9d2 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -2171,19 +2171,14 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind {
; KNL-LABEL: test_concat_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT: vpextrw $0, %xmm0, %eax
-; KNL-NEXT: movzwl %ax, %eax
-; KNL-NEXT: vmovd %eax, %xmm1
+; KNL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; KNL-NEXT: vcvtph2ps %xmm1, %xmm1
; KNL-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0]
; KNL-NEXT: vucomiss %xmm2, %xmm1
; KNL-NEXT: setb %al
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k0
-; KNL-NEXT: vpsrld $16, %xmm0, %xmm0
-; KNL-NEXT: vpextrw $0, %xmm0, %eax
-; KNL-NEXT: movzwl %ax, %eax
-; KNL-NEXT: vmovd %eax, %xmm0
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
; KNL-NEXT: vucomiss %xmm2, %xmm0
; KNL-NEXT: setb %al
@@ -2212,19 +2207,14 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind {
; SKX-LABEL: test_concat_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT: vpsrld $16, %xmm0, %xmm1
-; SKX-NEXT: vpextrw $0, %xmm1, %eax
-; SKX-NEXT: movzwl %ax, %eax
-; SKX-NEXT: vmovd %eax, %xmm1
+; SKX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SKX-NEXT: vcvtph2ps %xmm1, %xmm1
; SKX-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SKX-NEXT: vucomiss %xmm2, %xmm1
; SKX-NEXT: setb %al
; SKX-NEXT: kmovd %eax, %k0
; SKX-NEXT: kshiftlb $1, %k0, %k0
-; SKX-NEXT: vpextrw $0, %xmm0, %eax
-; SKX-NEXT: movzwl %ax, %eax
-; SKX-NEXT: vmovd %eax, %xmm0
+; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SKX-NEXT: vcvtph2ps %xmm0, %xmm0
; SKX-NEXT: vucomiss %xmm2, %xmm0
; SKX-NEXT: setb %al
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index f5cca7838bd87e..f3c728a990f514 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1436,10 +1436,9 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; KNL: ## %bb.0: ## %entry
; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; KNL-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10]
-; KNL-NEXT: vpextrw $0, %xmm1, %eax ## encoding: [0xc5,0xf9,0xc5,0xc1,0x00]
-; KNL-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0]
-; KNL-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
+; KNL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; KNL-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A]
+; KNL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
; KNL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
@@ -1449,9 +1448,8 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; KNL-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00]
; KNL-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1]
; KNL-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1]
-; KNL-NEXT: vpextrw $0, %xmm0, %edi ## encoding: [0xc5,0xf9,0xc5,0xf8,0x00]
-; KNL-NEXT: movzwl %di, %edi ## encoding: [0x0f,0xb7,0xff]
-; KNL-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
+; KNL-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0]
+; KNL-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
; KNL-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
; KNL-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1]
@@ -1468,10 +1466,9 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; AVX512BW: ## %bb.0: ## %entry
; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10]
-; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc1,0x00]
-; AVX512BW-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0]
-; AVX512BW-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A]
+; AVX512BW-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
; AVX512BW-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
@@ -1481,9 +1478,8 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; AVX512BW-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00]
; AVX512BW-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1]
; AVX512BW-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, %edi ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xf8,0x00]
-; AVX512BW-NEXT: movzwl %di, %edi ## encoding: [0x0f,0xb7,0xff]
-; AVX512BW-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
+; AVX512BW-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0]
+; AVX512BW-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
; AVX512BW-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
; AVX512BW-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1]
@@ -1500,10 +1496,9 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; SKX-NEXT: vpsrld $16, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xd0,0x10]
-; SKX-NEXT: vpextrw $0, %xmm1, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc1,0x00]
-; SKX-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0]
-; SKX-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
+; SKX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; SKX-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc9]
; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
; SKX-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca]
@@ -1512,9 +1507,8 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1]
; SKX-NEXT: testb %cl, %cl ## encoding: [0x84,0xc9]
; SKX-NEXT: setne %al ## encoding: [0x0f,0x95,0xc0]
-; SKX-NEXT: vpextrw $0, %xmm0, %ecx ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc8,0x00]
-; SKX-NEXT: movzwl %cx, %ecx ## encoding: [0x0f,0xb7,0xc9]
-; SKX-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; SKX-NEXT: vpmovzxwq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xc0]
+; SKX-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
; SKX-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
; SKX-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1]
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index c8708ea9b681fe..a3fb71f817ce47 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -699,34 +699,23 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) nounwind {
; AVX2-LABEL: stest_f16i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
-; AVX2-NEXT: vpextrw $0, %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpextrw $0, %xmm2, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvttss2si %xmm1, %rcx
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvttss2si %xmm2, %rax
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
@@ -849,9 +838,6 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
; AVX2-LABEL: utesth_f16i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
-; AVX2-NEXT: vpextrw $0, %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm2
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
@@ -860,37 +846,29 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpextrw $0, %xmm2, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
+; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vcvttss2si %xmm2, %rcx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vpextrw $0, %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: vmovd %edx, %xmm3
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: andq %rax, %rdx
-; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
+; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm4
; AVX2-NEXT: vcvttss2si %xmm4, %rax
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm4
-; AVX2-NEXT: vcvttss2si %xmm3, %rcx
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; AVX2-NEXT: vcvttss2si %xmm2, %rcx
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm3[0]
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
@@ -901,7 +879,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
@@ -1024,34 +1002,23 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind {
; AVX2-LABEL: ustest_f16i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
-; AVX2-NEXT: vpextrw $0, %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpextrw $0, %xmm2, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
@@ -3347,34 +3314,23 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-LABEL: stest_f16i32_mm:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
-; AVX2-NEXT: vpextrw $0, %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpextrw $0, %xmm2, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvttss2si %xmm1, %rcx
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvttss2si %xmm2, %rax
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
@@ -3495,9 +3451,6 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-LABEL: utesth_f16i32_mm:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
-; AVX2-NEXT: vpextrw $0, %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm2
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
@@ -3506,37 +3459,29 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpextrw $0, %xmm2, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
+; AVX2-NEXT: vmovq %rdx, %xmm3
; AVX2-NEXT: vcvttss2si %xmm2, %rcx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vpextrw $0, %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: vmovd %edx, %xmm3
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: andq %rax, %rdx
-; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
+; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm4
; AVX2-NEXT: vcvttss2si %xmm4, %rax
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm4
-; AVX2-NEXT: vcvttss2si %xmm3, %rcx
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; AVX2-NEXT: vcvttss2si %xmm2, %rcx
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm3[0]
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
@@ -3547,7 +3492,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1
@@ -3669,34 +3614,23 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-LABEL: ustest_f16i32_mm:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1
-; AVX2-NEXT: vpextrw $0, %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpextrw $0, %xmm2, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index d0853fdc748d29..2e1322446032ff 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1614,15 +1614,10 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
;
; BWON-F16C-LABEL: maxnum_v8f16:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; BWON-F16C-NEXT: vpextrw $0, %xmm2, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm2
+; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; BWON-F16C-NEXT: vpshufb %xmm3, %xmm1, %xmm2
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; BWON-F16C-NEXT: vpextrw $0, %xmm3, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm3
+; BWON-F16C-NEXT: vpshufb %xmm3, %xmm0, %xmm3
; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
; BWON-F16C-NEXT: vucomiss %xmm2, %xmm3
; BWON-F16C-NEXT: ja .LBB26_2
@@ -1630,15 +1625,10 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vmovaps %xmm2, %xmm3
; BWON-F16C-NEXT: .LBB26_2:
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2
-; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; BWON-F16C-NEXT: vpextrw $0, %xmm3, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm3
+; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; BWON-F16C-NEXT: vpshufb %xmm4, %xmm1, %xmm3
; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; BWON-F16C-NEXT: vpextrw $0, %xmm4, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm4
+; BWON-F16C-NEXT: vpshufb %xmm4, %xmm0, %xmm4
; BWON-F16C-NEXT: vcvtph2ps %xmm4, %xmm4
; BWON-F16C-NEXT: vucomiss %xmm3, %xmm4
; BWON-F16C-NEXT: ja .LBB26_4
@@ -1648,49 +1638,33 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vmovd %xmm2, %eax
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm4, %xmm2
; BWON-F16C-NEXT: vmovd %xmm2, %ecx
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; BWON-F16C-NEXT: vpextrw $0, %xmm2, %edx
-; BWON-F16C-NEXT: movzwl %dx, %edx
-; BWON-F16C-NEXT: vmovd %edx, %xmm2
-; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; BWON-F16C-NEXT: vpextrw $0, %xmm3, %edx
-; BWON-F16C-NEXT: movzwl %dx, %edx
-; BWON-F16C-NEXT: vmovd %edx, %xmm3
+; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; BWON-F16C-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vucomiss %xmm2, %xmm3
+; BWON-F16C-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
+; BWON-F16C-NEXT: vucomiss %xmm3, %xmm2
; BWON-F16C-NEXT: ja .LBB26_6
; BWON-F16C-NEXT: # %bb.5:
-; BWON-F16C-NEXT: vmovaps %xmm2, %xmm3
+; BWON-F16C-NEXT: vmovaps %xmm3, %xmm2
; BWON-F16C-NEXT: .LBB26_6:
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; BWON-F16C-NEXT: vmovd %xmm2, %edx
-; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; BWON-F16C-NEXT: vpextrw $0, %xmm2, %esi
-; BWON-F16C-NEXT: movzwl %si, %esi
-; BWON-F16C-NEXT: vmovd %esi, %xmm2
+; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm3
+; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; BWON-F16C-NEXT: vpextrw $0, %xmm3, %esi
-; BWON-F16C-NEXT: movzwl %si, %esi
-; BWON-F16C-NEXT: vmovd %esi, %xmm3
-; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vucomiss %xmm2, %xmm3
+; BWON-F16C-NEXT: vucomiss %xmm3, %xmm2
; BWON-F16C-NEXT: ja .LBB26_8
; BWON-F16C-NEXT: # %bb.7:
-; BWON-F16C-NEXT: vmovaps %xmm2, %xmm3
+; BWON-F16C-NEXT: vmovaps %xmm3, %xmm2
; BWON-F16C-NEXT: .LBB26_8:
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; BWON-F16C-NEXT: vmovd %xmm2, %esi
-; BWON-F16C-NEXT: vpsrlq $48, %xmm1, %xmm2
-; BWON-F16C-NEXT: vpextrw $0, %xmm2, %edi
-; BWON-F16C-NEXT: movzwl %di, %edi
-; BWON-F16C-NEXT: vmovd %edi, %xmm2
+; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; BWON-F16C-NEXT: vpshufb %xmm3, %xmm1, %xmm2
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vpsrlq $48, %xmm0, %xmm3
-; BWON-F16C-NEXT: vpextrw $0, %xmm3, %edi
-; BWON-F16C-NEXT: movzwl %di, %edi
-; BWON-F16C-NEXT: vmovd %edi, %xmm3
+; BWON-F16C-NEXT: vpshufb %xmm3, %xmm0, %xmm3
; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm6
; BWON-F16C-NEXT: vucomiss %xmm2, %xmm6
; BWON-F16C-NEXT: ja .LBB26_10
@@ -1703,54 +1677,39 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vpinsrw $0, %esi, %xmm0, %xmm5
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm6
; BWON-F16C-NEXT: vmovd %xmm6, %eax
-; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; BWON-F16C-NEXT: vpextrw $0, %xmm6, %ecx
-; BWON-F16C-NEXT: movzwl %cx, %ecx
-; BWON-F16C-NEXT: vmovd %ecx, %xmm6
+; BWON-F16C-NEXT: vpsrlq $48, %xmm1, %xmm6
+; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm7
+; BWON-F16C-NEXT: vpsrlq $48, %xmm0, %xmm6
; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6
-; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; BWON-F16C-NEXT: vpextrw $0, %xmm7, %ecx
-; BWON-F16C-NEXT: movzwl %cx, %ecx
-; BWON-F16C-NEXT: vmovd %ecx, %xmm7
-; BWON-F16C-NEXT: vcvtph2ps %xmm7, %xmm7
-; BWON-F16C-NEXT: vucomiss %xmm6, %xmm7
+; BWON-F16C-NEXT: vucomiss %xmm7, %xmm6
; BWON-F16C-NEXT: ja .LBB26_12
; BWON-F16C-NEXT: # %bb.11:
-; BWON-F16C-NEXT: vmovaps %xmm6, %xmm7
+; BWON-F16C-NEXT: vmovaps %xmm7, %xmm6
; BWON-F16C-NEXT: .LBB26_12:
; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm7, %xmm5
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm5
; BWON-F16C-NEXT: vmovd %xmm5, %eax
; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm5
-; BWON-F16C-NEXT: vpextrw $0, %xmm1, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm6
-; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6
-; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm7
+; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; BWON-F16C-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; BWON-F16C-NEXT: vcvtph2ps %xmm7, %xmm7
-; BWON-F16C-NEXT: vucomiss %xmm6, %xmm7
+; BWON-F16C-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6
+; BWON-F16C-NEXT: vucomiss %xmm7, %xmm6
; BWON-F16C-NEXT: ja .LBB26_14
; BWON-F16C-NEXT: # %bb.13:
-; BWON-F16C-NEXT: vmovaps %xmm6, %xmm7
+; BWON-F16C-NEXT: vmovaps %xmm7, %xmm6
; BWON-F16C-NEXT: .LBB26_14:
-; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm7, %xmm4
+; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm4
; BWON-F16C-NEXT: vmovd %xmm4, %eax
; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4
-; BWON-F16C-NEXT: vpsrld $16, %xmm1, %xmm1
-; BWON-F16C-NEXT: vpextrw $0, %xmm1, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm1
+; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: vpsrld $16, %xmm0, %xmm0
-; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0
; BWON-F16C-NEXT: ja .LBB26_16
@@ -1760,7 +1719,7 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; BWON-F16C-NEXT: vmovd %xmm0, %eax
; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; BWON-F16C-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; BWON-F16C-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll
index fa1014e3ae0da6..a21653bc7330c9 100644
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -41,15 +41,11 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
;
; F16C-LABEL: ir_fadd_v1f16:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vpextrw $0, %xmm1, %ecx
-; F16C-NEXT: movzwl %cx, %ecx
-; F16C-NEXT: vmovd %ecx, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: movzwl %ax, %eax
-; F16C-NEXT: vmovd %eax, %xmm1
+; F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovd %xmm0, %eax
; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll
index f373fb9af22b37..95f839c338e701 100644
--- a/llvm/test/CodeGen/X86/pr57340.ll
+++ b/llvm/test/CodeGen/X86/pr57340.ll
@@ -5,54 +5,43 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-LABEL: main.41:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastw (%rax), %xmm0
-; CHECK-NEXT: vmovdqu (%rax), %ymm2
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu (%rax), %xmm5
-; CHECK-NEXT: vpextrw $0, %xmm5, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
+; CHECK-NEXT: vmovdqu (%rax), %ymm1
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
+; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vmovdqu (%rax), %xmm10
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; CHECK-NEXT: vpshufb %xmm1, %xmm10, %xmm2
; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm0, %xmm2
+; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm4
+; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4
+; CHECK-NEXT: vucomiss %xmm4, %xmm2
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
-; CHECK-NEXT: vpsrld $16, %xmm1, %xmm3
-; CHECK-NEXT: vpextrw $0, %xmm3, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm3
-; CHECK-NEXT: vpsrld $16, %xmm5, %xmm4
-; CHECK-NEXT: vpextrw $0, %xmm4, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm4
; CHECK-NEXT: setne %al
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: vcvtph2ps %xmm3, %xmm6
-; CHECK-NEXT: vcvtph2ps %xmm4, %xmm3
-; CHECK-NEXT: kmovw %eax, %k0
-; CHECK-NEXT: vucomiss %xmm6, %xmm3
+; CHECK-NEXT: kmovd %eax, %k0
+; CHECK-NEXT: kshiftlw $15, %k0, %k0
+; CHECK-NEXT: kshiftrw $14, %k0, %k0
+; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
+; CHECK-NEXT: vcvtph2ps %xmm4, %xmm11
+; CHECK-NEXT: vucomiss %xmm0, %xmm11
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
; CHECK-NEXT: setne %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kshiftlw $15, %k1, %k1
-; CHECK-NEXT: kshiftrw $14, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: korw %k0, %k1, %k0
; CHECK-NEXT: movw $-5, %ax
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vprolq $32, %xmm1, %xmm4
-; CHECK-NEXT: vpextrw $0, %xmm4, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm4
-; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4
-; CHECK-NEXT: vucomiss %xmm4, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; CHECK-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5
+; CHECK-NEXT: vucomiss %xmm5, %xmm0
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -63,18 +52,12 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-9, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm4
-; CHECK-NEXT: vpextrw $0, %xmm4, %eax
+; CHECK-NEXT: vpsrlq $48, %xmm3, %xmm5
+; CHECK-NEXT: vcvtph2ps %xmm5, %xmm6
+; CHECK-NEXT: vpsrlq $48, %xmm10, %xmm5
+; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm4
-; CHECK-NEXT: vcvtph2ps %xmm4, %xmm6
-; CHECK-NEXT: vpsrlq $48, %xmm5, %xmm4
-; CHECK-NEXT: vpextrw $0, %xmm4, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm4
-; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4
-; CHECK-NEXT: vucomiss %xmm6, %xmm4
+; CHECK-NEXT: vucomiss %xmm6, %xmm5
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -85,13 +68,11 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-17, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
-; CHECK-NEXT: vpextrw $0, %xmm6, %eax
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm6
-; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6
-; CHECK-NEXT: vucomiss %xmm6, %xmm0
+; CHECK-NEXT: vpshufb %xmm6, %xmm3, %xmm7
+; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7
+; CHECK-NEXT: vucomiss %xmm7, %xmm0
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -102,18 +83,13 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-33, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpextrw $0, %xmm6, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm6
-; CHECK-NEXT: vcvtph2ps %xmm6, %xmm7
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpextrw $0, %xmm6, %eax
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; CHECK-NEXT: vpshufb %xmm7, %xmm10, %xmm8
+; CHECK-NEXT: vcvtph2ps %xmm8, %xmm8
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm6
-; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6
-; CHECK-NEXT: vucomiss %xmm7, %xmm6
+; CHECK-NEXT: vpshufb %xmm7, %xmm3, %xmm9
+; CHECK-NEXT: vcvtph2ps %xmm9, %xmm9
+; CHECK-NEXT: vucomiss %xmm9, %xmm8
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -124,13 +100,11 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-65, %ax
; CHECK-NEXT: kmovd %eax, %k1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; CHECK-NEXT: vpshufb %xmm9, %xmm3, %xmm12
+; CHECK-NEXT: vcvtph2ps %xmm12, %xmm12
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; CHECK-NEXT: vpextrw $0, %xmm7, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm7
-; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7
-; CHECK-NEXT: vucomiss %xmm7, %xmm0
+; CHECK-NEXT: vucomiss %xmm12, %xmm0
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -142,17 +116,11 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-129, %ax
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpextrw $0, %xmm7, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm7
-; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpextrw $0, %xmm5, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm5
-; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5
-; CHECK-NEXT: vucomiss %xmm7, %xmm5
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm12 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vcvtph2ps %xmm12, %xmm12
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vcvtph2ps %xmm10, %xmm10
+; CHECK-NEXT: vucomiss %xmm12, %xmm10
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -163,13 +131,11 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-257, %ax # imm = 0xFEFF
; CHECK-NEXT: kmovd %eax, %k1
+; CHECK-NEXT: vextracti128 $1, %ymm3, %xmm3
+; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; CHECK-NEXT: vcvtph2ps %xmm12, %xmm12
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpextrw $0, %xmm1, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm7
-; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7
-; CHECK-NEXT: vucomiss %xmm7, %xmm2
+; CHECK-NEXT: vucomiss %xmm12, %xmm11
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -181,12 +147,9 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-513, %ax # imm = 0xFDFF
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpsrld $16, %xmm1, %xmm2
-; CHECK-NEXT: vpextrw $0, %xmm2, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm2, %xmm3
+; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-NEXT: vucomiss %xmm1, %xmm2
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -197,13 +160,10 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-1025, %ax # imm = 0xFBFF
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vprolq $32, %xmm1, %xmm2
-; CHECK-NEXT: vpextrw $0, %xmm2, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm2, %xmm0
+; CHECK-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-NEXT: vucomiss %xmm1, %xmm0
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -215,12 +175,9 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-2049, %ax # imm = 0xF7FF
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm2
-; CHECK-NEXT: vpextrw $0, %xmm2, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm2, %xmm4
+; CHECK-NEXT: vpsrlq $48, %xmm3, %xmm1
+; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-NEXT: vucomiss %xmm1, %xmm5
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -231,13 +188,10 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-4097, %ax # imm = 0xEFFF
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; CHECK-NEXT: vpextrw $0, %xmm2, %eax
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm2, %xmm0
+; CHECK-NEXT: vpshufb %xmm6, %xmm3, %xmm1
+; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-NEXT: vucomiss %xmm1, %xmm0
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -249,12 +203,9 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: movw $-8193, %ax # imm = 0xDFFF
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpextrw $0, %xmm2, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm2, %xmm6
+; CHECK-NEXT: vpshufb %xmm7, %xmm3, %xmm1
+; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-NEXT: vucomiss %xmm1, %xmm8
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -265,13 +216,10 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: movw $-16385, %ax # imm = 0xBFFF
; CHECK-NEXT: kmovd %eax, %k1
+; CHECK-NEXT: vpshufb %xmm9, %xmm3, %xmm1
+; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1
; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; CHECK-NEXT: vpextrw $0, %xmm2, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm2
-; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT: vucomiss %xmm2, %xmm0
+; CHECK-NEXT: vucomiss %xmm1, %xmm0
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
@@ -280,13 +228,10 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: kshiftlw $14, %k1, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kshiftlw $1, %k0, %k0
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-NEXT: kshiftrw $1, %k0, %k0
-; CHECK-NEXT: vucomiss %xmm0, %xmm5
+; CHECK-NEXT: vucomiss %xmm0, %xmm10
; CHECK-NEXT: setnp %al
; CHECK-NEXT: sete %cl
; CHECK-NEXT: testb %al, %cl
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index f59960f06f4a11..3b82df5d5b74d2 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -4976,32 +4976,22 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
;
; F16C-LABEL: fptosi_2f16_to_4i32:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: movzwl %ax, %eax
-; F16C-NEXT: vmovd %eax, %xmm1
+; F16C-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT: vpsrld $16, %xmm0, %xmm0
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: movzwl %ax, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; F16C-NEXT: retq
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpextrw $0, %xmm0, %eax
-; AVX512-NEXT: movzwl %ax, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpextrw $0, %xmm0, %eax
-; AVX512-NEXT: movzwl %ax, %eax
-; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index 71c4427da96253..6e8eefc607ee11 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -413,13 +413,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512F-NEXT: vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT: vpextrw $0, %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: xorl %eax, %eax
; AVX512F-NEXT: vucomiss %xmm3, %xmm2
@@ -434,13 +430,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512VL-LABEL: test_v2f16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512VL-NEXT: vpextrw $0, %xmm0, %eax
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: vpextrw $0, %xmm1, %eax
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: xorl %eax, %eax
; AVX512VL-NEXT: vucomiss %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
index 0b2f9d69f0623c..804ca183ad4c9d 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
@@ -412,13 +412,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512F-NEXT: vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT: vpextrw $0, %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: xorl %eax, %eax
; AVX512F-NEXT: vucomiss %xmm3, %xmm2
@@ -433,13 +429,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; AVX512VL-LABEL: test_v2f16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512VL-NEXT: vpextrw $0, %xmm0, %eax
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: vpextrw $0, %xmm1, %eax
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: xorl %eax, %eax
; AVX512VL-NEXT: vucomiss %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 468fec66c028b7..6360c68e62cc94 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2012,24 +2012,18 @@ define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) {
; SSE2-LABEL: extract3_insert0_v4i32_7123:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: extract3_insert0_v4i32_7123:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE3-NEXT: movd %xmm1, %eax
-; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: extract3_insert0_v4i32_7123:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSSE3-NEXT: movd %xmm1, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT: retq
;