[llvm] r368107 - Revert "[X86] Enable -x86-experimental-vector-widening-legalization by default."
Mitch Phillips via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 6 16:00:44 PDT 2019
Modified: llvm/trunk/test/CodeGen/X86/oddshuffles.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/oddshuffles.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/oddshuffles.ll (original)
+++ llvm/trunk/test/CodeGen/X86/oddshuffles.ll Tue Aug 6 16:00:43 2019
@@ -68,7 +68,7 @@ define void @v3f64(<2 x double> %a, <2 x
define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
; SSE2-LABEL: v3i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movd %xmm2, 8(%rdi)
; SSE2-NEXT: movq %xmm0, (%rdi)
@@ -76,7 +76,7 @@ define void @v3i32(<2 x i32> %a, <2 x i3
;
; SSE42-LABEL: v3i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: extractps $1, %xmm0, 8(%rdi)
+; SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
; SSE42-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE42-NEXT: movlps %xmm0, (%rdi)
; SSE42-NEXT: retq
@@ -84,14 +84,14 @@ define void @v3i32(<2 x i32> %a, <2 x i3
; AVX-LABEL: v3i32:
; AVX: # %bb.0:
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vextractps $1, %xmm0, 8(%rdi)
+; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovlps %xmm1, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v3i32:
; XOP: # %bb.0:
; XOP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; XOP-NEXT: vextractps $1, %xmm0, 8(%rdi)
+; XOP-NEXT: vextractps $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovlps %xmm1, (%rdi)
; XOP-NEXT: retq
%r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> <i32 0, i32 2, i32 1>
@@ -102,34 +102,58 @@ define void @v3i32(<2 x i32> %a, <2 x i3
define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
; SSE2-LABEL: v5i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psrlq $16, %xmm1
-; SSE2-NEXT: pextrw $3, %xmm0, %eax
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: pextrw $6, %xmm0, %eax
; SSE2-NEXT: movw %ax, 8(%rdi)
-; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: movq %xmm2, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v5i16:
; SSE42: # %bb.0:
-; SSE42-NEXT: psrlq $16, %xmm1
-; SSE42-NEXT: pextrw $3, %xmm0, 8(%rdi)
-; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE42-NEXT: movq %xmm0, (%rdi)
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; SSE42-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE42-NEXT: pextrw $6, %xmm0, 8(%rdi)
+; SSE42-NEXT: movq %xmm2, (%rdi)
; SSE42-NEXT: retq
;
-; AVX-LABEL: v5i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vpextrw $3, %xmm0, 8(%rdi)
-; AVX-NEXT: vmovq %xmm1, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: v5i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT: vpextrw $6, %xmm0, 8(%rdi)
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: v5i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX2-SLOW-NEXT: vpextrw $6, %xmm0, 8(%rdi)
+; AVX2-SLOW-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: v5i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX2-FAST-NEXT: vpextrw $6, %xmm0, 8(%rdi)
+; AVX2-FAST-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v5i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpsrlq $16, %xmm1, %xmm1
-; XOP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; XOP-NEXT: vpextrw $3, %xmm0, 8(%rdi)
+; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1],xmm1[4,5],xmm0[4,5],xmm1[8,9],xmm0[4,5],xmm1[4,5],xmm0[6,7],xmm1[6,7]
+; XOP-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm1, (%rdi)
; XOP-NEXT: retq
%r = shufflevector <4 x i16> %a, <4 x i16> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
@@ -227,35 +251,42 @@ define void @v5f32(<4 x float> %a, <4 x
define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
; SSE2-LABEL: v7i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,0,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: movb %al, 6(%rdi)
-; SSE2-NEXT: movd %xmm2, (%rdi)
-; SSE2-NEXT: pextrw $2, %xmm2, %eax
+; SSE2-NEXT: movd %xmm0, (%rdi)
+; SSE2-NEXT: pextrw $2, %xmm0, %eax
; SSE2-NEXT: movw %ax, 4(%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i8:
; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi)
-; SSE42-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,4,7,4,3,6,0,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdi)
; SSE42-NEXT: movd %xmm1, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v7i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,4,7,4,3,6,0,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi)
; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX-NEXT: vmovd %xmm0, (%rdi)
@@ -263,7 +294,7 @@ define void @v7i8(<4 x i8> %a, <4 x i8>
;
; XOP-LABEL: v7i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[3],xmm1[2],xmm0[1],xmm1[3,0,u,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,u,u,u,u,u,u,u,u,u]
; XOP-NEXT: vpextrb $0, %xmm1, 6(%rdi)
; XOP-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; XOP-NEXT: vmovd %xmm0, (%rdi)
@@ -276,32 +307,36 @@ define void @v7i8(<4 x i8> %a, <4 x i8>
define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
; SSE2-LABEL: v7i16:
; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,0,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: movw %ax, 12(%rdi)
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm2, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i16:
; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT: pextrw $0, %xmm1, 12(%rdi)
-; SSE42-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15]
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; SSE42-NEXT: pextrd $2, %xmm1, 8(%rdi)
; SSE42-NEXT: movq %xmm1, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v7i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
; AVX-NEXT: vpextrw $0, %xmm1, 12(%rdi)
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
@@ -309,7 +344,7 @@ define void @v7i16(<4 x i16> %a, <4 x i1
;
; XOP-LABEL: v7i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[4,5],xmm0[6,7],xmm1[4,5],xmm0[2,3],xmm1[6,7,0,1],xmm0[6,7]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[8,9],xmm0[12,13],xmm1[8,9],xmm0[4,5],xmm1[12,13,0,1,14,15]
; XOP-NEXT: vpextrw $0, %xmm1, 12(%rdi)
; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm0, (%rdi)
@@ -375,19 +410,19 @@ define void @v7i32(<4 x i32> %a, <4 x i3
define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
; SSE2-LABEL: v12i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
+; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,3]
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movq %xmm2, (%rdi)
@@ -397,23 +432,27 @@ define void @v12i8(<8 x i8> %a, <8 x i8>
;
; SSE42-LABEL: v12i8:
; SSE42: # %bb.0:
-; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
+; SSE42-NEXT: por %xmm1, %xmm0
; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
; SSE42-NEXT: movq %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v12i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v12i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm1[0],xmm0[1,5],xmm1[1],xmm0[2,6],xmm1[2],xmm0[3,7],xmm1[3],xmm0[u,u,u,u]
+; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
+; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm0, (%rdi)
; XOP-NEXT: retq
@@ -620,7 +659,11 @@ define void @v12i32(<8 x i32> %a, <8 x i
define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounwind {
; SSE2-LABEL: pr29025:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
@@ -632,6 +675,9 @@ define void @pr29025(<4 x i8> %a, <4 x i
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3]
; SSE2-NEXT: pandn %xmm2, %xmm1
@@ -643,7 +689,11 @@ define void @pr29025(<4 x i8> %a, <4 x i
;
; SSE42-LABEL: pr29025:
; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm3, %xmm1
+; SSE42-NEXT: pshufb %xmm3, %xmm0
; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE42-NEXT: pshufb %xmm3, %xmm2
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
@@ -652,8 +702,12 @@ define void @pr29025(<4 x i8> %a, <4 x i
;
; AVX-LABEL: pr29025:
; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
@@ -661,8 +715,8 @@ define void @pr29025(<4 x i8> %a, <4 x i
;
; XOP-LABEL: pr29025:
; XOP: # %bb.0:
-; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[1],xmm0[2,6],xmm2[2],xmm0[3,7],xmm2[3],xmm0[u,u,u,u]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4,8,12],xmm1[0,4,8,12],xmm0[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[4],xmm0[2,6],xmm2[8],xmm0[3,7],xmm2[12],xmm0[u,u,u,u]
; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm0, (%rdi)
; XOP-NEXT: retq
@@ -697,39 +751,39 @@ define void @interleave_24i8_out(<24 x i
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm6
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; SSE2-NEXT: packuswb %xmm0, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT: movq %xmm4, (%rsi)
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; SSE2-NEXT: packuswb %xmm0, %xmm4
+; SSE2-NEXT: movq %xmm4, (%rdx)
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm4, (%rsi)
-; SSE2-NEXT: movq %xmm5, (%rdx)
; SSE2-NEXT: movq %xmm0, (%rcx)
; SSE2-NEXT: retq
;
@@ -742,16 +796,16 @@ define void @interleave_24i8_out(<24 x i
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; SSE42-NEXT: por %xmm2, %xmm3
+; SSE42-NEXT: movq %xmm3, (%rsi)
; SSE42-NEXT: movdqa %xmm1, %xmm2
; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: movdqa %xmm0, %xmm4
-; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u]
-; SSE42-NEXT: por %xmm2, %xmm4
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; SSE42-NEXT: por %xmm2, %xmm3
+; SSE42-NEXT: movq %xmm3, (%rdx)
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSE42-NEXT: por %xmm1, %xmm0
-; SSE42-NEXT: movq %xmm3, (%rsi)
-; SSE42-NEXT: movq %xmm4, (%rdx)
; SSE42-NEXT: movq %xmm0, (%rcx)
; SSE42-NEXT: retq
;
@@ -762,14 +816,14 @@ define void @interleave_24i8_out(<24 x i
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vmovq %xmm2, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vmovq %xmm2, (%rdx)
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
; AVX-NEXT: vmovq %xmm0, (%rcx)
; AVX-NEXT: retq
;
@@ -780,14 +834,14 @@ define void @interleave_24i8_out(<24 x i
; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; XOP-NEXT: vmovq %xmm2, (%rsi)
+; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vmovq %xmm2, (%rdx)
; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vmovq %xmm2, (%rsi)
-; XOP-NEXT: vmovq %xmm3, (%rdx)
; XOP-NEXT: vmovq %xmm0, (%rcx)
; XOP-NEXT: retq
%wide.vec = load <24 x i8>, <24 x i8>* %p, align 4
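
A note on the pattern of changes above: with the widening flag reverted, a
<2 x i32> operand is once again promoted (each i32 carried in the low half of
a wider lane) rather than widened into lanes 0-1, so scalar extracts move to
even indices -- extractps $1 becomes extractps $2, and for <4 x i16> pextrw $3
becomes pextrw $6. For reference, a minimal sketch of the v3i32 test whose
checks change above; the RUN line and the trailing store are assumptions here,
since the diff context elides everything but the shuffle:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
  ; result lanes are [a0, b0, a1]; a0/b0 go out as one 8-byte store and
  ; a1 is stored separately to 8(%rdi)
  %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> <i32 0, i32 2, i32 1>
  store <3 x i32> %r, <3 x i32>* %p, align 4
  ret void
}
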
Modified: llvm/trunk/test/CodeGen/X86/oddsubvector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/oddsubvector.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/oddsubvector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/oddsubvector.ll Tue Aug 6 16:00:43 2019
@@ -12,12 +12,19 @@ define void @insert_v7i8_v2i16_2(<7 x i8
; SSE2-LABEL: insert_v7i8_v2i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: pextrw $3, %xmm1, %eax
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %xmm1, (%rdi)
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE2-NEXT: movaps {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: andps %xmm0, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: movb %al, 6(%rdi)
-; SSE2-NEXT: pextrw $1, %xmm0, %eax
+; SSE2-NEXT: movd %xmm1, (%rdi)
+; SSE2-NEXT: pextrw $2, %xmm1, %eax
; SSE2-NEXT: movw %ax, 4(%rdi)
; SSE2-NEXT: retq
;
@@ -25,40 +32,52 @@ define void @insert_v7i8_v2i16_2(<7 x i8
; SSE42: # %bb.0:
; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
+; SSE42-NEXT: packuswb %xmm0, %xmm0
; SSE42-NEXT: pextrb $6, %xmm1, 6(%rdi)
-; SSE42-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE42-NEXT: pextrw $1, %xmm0, 4(%rdi)
-; SSE42-NEXT: movd %xmm1, (%rdi)
+; SSE42-NEXT: pextrw $2, %xmm0, 4(%rdi)
+; SSE42-NEXT: movd %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: insert_v7i8_v2i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX1-NEXT: vpextrw $1, %xmm0, 4(%rdi)
-; AVX1-NEXT: vmovd %xmm2, (%rdi)
+; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi)
+; AVX1-NEXT: vmovd %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v7i8_v2i16_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3]
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX2-NEXT: vpextrw $1, %xmm0, 4(%rdi)
-; AVX2-NEXT: vmovd %xmm2, (%rdi)
+; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi)
+; AVX2-NEXT: vmovd %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: insert_v7i8_v2i16_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3]
+; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX512-NEXT: vpextrw $1, %xmm0, 4(%rdi)
-; AVX512-NEXT: vmovd %xmm2, (%rdi)
+; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdi)
+; AVX512-NEXT: vmovd %xmm0, (%rdi)
; AVX512-NEXT: retq
;
; XOP-LABEL: insert_v7i8_v2i16_2:
Modified: llvm/trunk/test/CodeGen/X86/pmaddubsw.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmaddubsw.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmaddubsw.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmaddubsw.ll Tue Aug 6 16:00:43 2019
@@ -324,23 +324,23 @@ define <8 x i16> @pmaddubsw_bad_extend(<
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
-; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm3
+; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -355,14 +355,14 @@ define <8 x i16> @pmaddubsw_bad_extend(<
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX2-NEXT: vpmulld %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
@@ -377,14 +377,14 @@ define <8 x i16> @pmaddubsw_bad_extend(<
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX512-NEXT: vpmovsxbd %xmm3, %ymm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX512-NEXT: vpmulld %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
@@ -452,23 +452,23 @@ define <8 x i16> @pmaddubsw_bad_indices(
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpmovsxbd %xmm2, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -481,13 +481,13 @@ define <8 x i16> @pmaddubsw_bad_indices(
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
@@ -501,13 +501,13 @@ define <8 x i16> @pmaddubsw_bad_indices(
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovsxbd %xmm2, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX512-NEXT: vpmulld %ymm3, %ymm2, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
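
The pmaddubsw hunks above are pure scheduling changes: the same vpshufb /
vpmovsxbd / vpmovzxbd / vpmulld sequence is emitted in a different order with
renumbered temporaries. The pattern being compiled is an even/odd
de-interleave with mismatched extends, which is what keeps it from folding to
pmaddubsw. A minimal sketch of that shape (value names and the trunc tail are
assumptions reconstructed from the assembly, not the verbatim in-tree test):

define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ; pmaddubsw needs unsigned x signed throughout; the extends here are
  ; swapped between the even and odd products, so no fold is possible
  %sAe = sext <8 x i8> %A_even to <8 x i32>
  %zBe = zext <8 x i8> %B_even to <8 x i32>
  %even = mul <8 x i32> %sAe, %zBe
  %zAo = zext <8 x i8> %A_odd to <8 x i32>
  %sBo = sext <8 x i8> %B_odd to <8 x i32>
  %odd = mul <8 x i32> %zAo, %sBo
  %sum = add <8 x i32> %even, %odd
  %res = trunc <8 x i32> %sum to <8 x i16>
  ret <8 x i16> %res
}
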
Modified: llvm/trunk/test/CodeGen/X86/pmulh.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmulh.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmulh.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmulh.ll Tue Aug 6 16:00:43 2019
@@ -8,14 +8,45 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
-; SSE-LABEL: mulhuw_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pmulhuw %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-PROMOTE-LABEL: mulhuw_v4i16:
+; SSE2-PROMOTE: # %bb.0:
+; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-PROMOTE-NEXT: pmulhuw %xmm1, %xmm0
+; SSE2-PROMOTE-NEXT: pxor %xmm1, %xmm1
+; SSE2-PROMOTE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-PROMOTE-NEXT: retq
+;
+; SSE2-WIDEN-LABEL: mulhuw_v4i16:
+; SSE2-WIDEN: # %bb.0:
+; SSE2-WIDEN-NEXT: pmulhuw %xmm1, %xmm0
+; SSE2-WIDEN-NEXT: retq
+;
+; SSE41-PROMOTE-LABEL: mulhuw_v4i16:
+; SSE41-PROMOTE: # %bb.0:
+; SSE41-PROMOTE-NEXT: pxor %xmm2, %xmm2
+; SSE41-PROMOTE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-PROMOTE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-PROMOTE-NEXT: pmulld %xmm1, %xmm0
+; SSE41-PROMOTE-NEXT: psrld $16, %xmm0
+; SSE41-PROMOTE-NEXT: retq
+;
+; SSE41-WIDEN-LABEL: mulhuw_v4i16:
+; SSE41-WIDEN: # %bb.0:
+; SSE41-WIDEN-NEXT: pmulhuw %xmm1, %xmm0
+; SSE41-WIDEN-NEXT: retq
;
; AVX-LABEL: mulhuw_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: retq
%a1 = zext <4 x i16> %a to <4 x i32>
%b1 = zext <4 x i16> %b to <4 x i32>
@@ -26,14 +57,47 @@ define <4 x i16> @mulhuw_v4i16(<4 x i16>
}
define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
-; SSE-LABEL: mulhw_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pmulhw %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-PROMOTE-LABEL: mulhw_v4i16:
+; SSE2-PROMOTE: # %bb.0:
+; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-PROMOTE-NEXT: pmulhw %xmm1, %xmm0
+; SSE2-PROMOTE-NEXT: pxor %xmm1, %xmm1
+; SSE2-PROMOTE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-PROMOTE-NEXT: retq
+;
+; SSE2-WIDEN-LABEL: mulhw_v4i16:
+; SSE2-WIDEN: # %bb.0:
+; SSE2-WIDEN-NEXT: pmulhw %xmm1, %xmm0
+; SSE2-WIDEN-NEXT: retq
+;
+; SSE41-PROMOTE-LABEL: mulhw_v4i16:
+; SSE41-PROMOTE: # %bb.0:
+; SSE41-PROMOTE-NEXT: pslld $16, %xmm0
+; SSE41-PROMOTE-NEXT: psrad $16, %xmm0
+; SSE41-PROMOTE-NEXT: pslld $16, %xmm1
+; SSE41-PROMOTE-NEXT: psrad $16, %xmm1
+; SSE41-PROMOTE-NEXT: pmulld %xmm1, %xmm0
+; SSE41-PROMOTE-NEXT: psrld $16, %xmm0
+; SSE41-PROMOTE-NEXT: retq
+;
+; SSE41-WIDEN-LABEL: mulhw_v4i16:
+; SSE41-WIDEN: # %bb.0:
+; SSE41-WIDEN-NEXT: pmulhw %xmm1, %xmm0
+; SSE41-WIDEN-NEXT: retq
;
; AVX-LABEL: mulhw_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: retq
%a1 = sext <4 x i16> %a to <4 x i32>
%b1 = sext <4 x i16> %b to <4 x i32>
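
The pmulh tests compute the high 16 bits of a widened multiply. Under
widening the whole pattern folds to pmulhuw/pmulhw; after this revert the
promoted <4 x i32> form keeps the extend, multiply, and shift visible
(pblendw/pslld for the extend, pmulld, psrld $16). The zext lines above are
from the diff context; the mul/lshr/trunc tail is an assumption filled in
from the function's purpose:

define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
  %a1 = zext <4 x i16> %a to <4 x i32>
  %b1 = zext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  ; keep only the high 16 bits of each 32-bit product
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}
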
Modified: llvm/trunk/test/CodeGen/X86/pointer-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pointer-vector.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pointer-vector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pointer-vector.ll Tue Aug 6 16:00:43 2019
@@ -117,7 +117,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %
; CHECK-LABEL: BITCAST1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; CHECK-NEXT: retl
entry:
%G = load <2 x i8*>, <2 x i8*>* %p
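
For BITCAST1 the target is 32-bit, so <2 x i8*> legalizes like <2 x i32>;
after the revert the two pointers are promoted into i64 lanes with a
zero-extending load (pmovzxdq) rather than loaded in place with movsd. Only
the load appears in the diff context; the bitcast/ret tail below is assumed:

define <2 x i32*> @BITCAST1(<2 x i8*>* %p) {
entry:
  %G = load <2 x i8*>, <2 x i8*>* %p
  ; pointer-type reinterpret only; no data movement is implied by the IR
  %T = bitcast <2 x i8*> %G to <2 x i32*>
  ret <2 x i32*> %T
}
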
Modified: llvm/trunk/test/CodeGen/X86/pr14161.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr14161.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr14161.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr14161.ll Tue Aug 6 16:00:43 2019
@@ -7,6 +7,7 @@ define <2 x i16> @good(<4 x i32>*, <4 x
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa (%rdi), %xmm0
; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pmovzxwq %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
%2 = load <4 x i32>, <4 x i32>* %0, align 16
@@ -26,6 +27,7 @@ define <2 x i16> @bad(<4 x i32>*, <4 x i
; CHECK-NEXT: movdqa (%rdi), %xmm0
; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-NEXT: retq
entry:
%2 = load <4 x i32>, <4 x i32>* %0, align 16
Modified: llvm/trunk/test/CodeGen/X86/pr35918.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr35918.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr35918.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr35918.ll Tue Aug 6 16:00:43 2019
@@ -5,31 +5,79 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=X64,X64-SKX
define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32], [128 x i64] }*) nounwind {
-; X86-LABEL: fetch_r16g16_snorm_unorm8:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpsrlw $7, %xmm0, %xmm0
-; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
-; X86-NEXT: vmovd %xmm0, %ecx
-; X86-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: retl
+; X86-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
+; X86-SKYLAKE: # %bb.0: # %entry
+; X86-SKYLAKE-NEXT: subl $8, %esp
+; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X86-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0
+; X86-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0
+; X86-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-SKYLAKE-NEXT: vmovd %xmm0, %ecx
+; X86-SKYLAKE-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
+; X86-SKYLAKE-NEXT: movl %ecx, (%eax)
+; X86-SKYLAKE-NEXT: addl $8, %esp
+; X86-SKYLAKE-NEXT: retl
;
-; X64-LABEL: fetch_r16g16_snorm_unorm8:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpsrlw $7, %xmm0, %xmm0
-; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; X64-NEXT: movl %eax, (%rdi)
-; X64-NEXT: retq
+; X86-SKX-LABEL: fetch_r16g16_snorm_unorm8:
+; X86-SKX: # %bb.0: # %entry
+; X86-SKX-NEXT: subl $8, %esp
+; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
+; X86-SKX-NEXT: vpsrad $16, %xmm0, %xmm0
+; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X86-SKX-NEXT: vpsrld $7, %xmm0, %xmm0
+; X86-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-SKX-NEXT: vpmovqw %xmm0, {{[0-9]+}}(%esp)
+; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-SKX-NEXT: vpmovdb %xmm0, (%esp)
+; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT: movzwl (%esp), %ecx
+; X86-SKX-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
+; X86-SKX-NEXT: movl %ecx, (%eax)
+; X86-SKX-NEXT: addl $8, %esp
+; X86-SKX-NEXT: retl
+;
+; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
+; X64-SKYLAKE: # %bb.0: # %entry
+; X64-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X64-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0
+; X64-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0
+; X64-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-SKYLAKE-NEXT: vmovd %xmm0, %eax
+; X64-SKYLAKE-NEXT: orl $-16777216, %eax # imm = 0xFF000000
+; X64-SKYLAKE-NEXT: movl %eax, (%rdi)
+; X64-SKYLAKE-NEXT: retq
+;
+; X64-SKX-LABEL: fetch_r16g16_snorm_unorm8:
+; X64-SKX: # %bb.0: # %entry
+; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
+; X64-SKX-NEXT: vpsrad $16, %xmm0, %xmm0
+; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-SKX-NEXT: vpsrld $7, %xmm0, %xmm0
+; X64-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-SKX-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SKX-NEXT: vpmovdb %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SKX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; X64-SKX-NEXT: orl $-16777216, %eax # imm = 0xFF000000
+; X64-SKX-NEXT: movl %eax, (%rdi)
+; X64-SKX-NEXT: retq
entry:
%5 = bitcast i8* %1 to <2 x i16>*
%6 = load <2 x i16>, <2 x i16>* %5, align 2
Modified: llvm/trunk/test/CodeGen/X86/pr40994.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr40994.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr40994.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr40994.ll Tue Aug 6 16:00:43 2019
@@ -12,7 +12,7 @@ define <8 x i8> @foo(<16 x i8> %a) {
; CHECK-NEXT: pextrb $10, %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: pextrb $12, %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: pextrb $14, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; CHECK-NEXT: retq
%v = alloca i8, i32 8, align 16
call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, i8* %v, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>)
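
pr40994 compress-stores the even bytes of %a to an 8-byte alloca and reloads
them as <8 x i8>; after the revert that reload is a promoting pmovzxbw (bytes
zero-extended into word lanes) instead of a widened movsd. The alloca and the
compressstore are verbatim from the test above; the reload tail and the
declare are assumptions added so the sketch is self-contained:

define <8 x i8> @foo(<16 x i8> %a) {
  %v = alloca i8, i32 8, align 16
  call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, i8* %v, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>)
  %ptr = bitcast i8* %v to <8 x i8>*
  %out = load <8 x i8>, <8 x i8>* %ptr
  ret <8 x i8> %out
}
declare void @llvm.masked.compressstore.v16i8(<16 x i8>, i8*, <16 x i1>)
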
Modified: llvm/trunk/test/CodeGen/X86/promote-vec3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/promote-vec3.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/promote-vec3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/promote-vec3.ll Tue Aug 6 16:00:43 2019
@@ -8,13 +8,11 @@
define <3 x i16> @zext_i8(<3 x i8>) {
; SSE3-LABEL: zext_i8:
; SSE3: # %bb.0:
+; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movd %eax, %xmm0
-; SSE3-NEXT: pinsrw $1, %edx, %xmm0
-; SSE3-NEXT: pinsrw $2, %ecx, %xmm0
-; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pextrw $0, %xmm0, %eax
; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-NEXT: # kill: def $dx killed $dx killed $edx
; SSE3-NEXT: # kill: def $cx killed $cx killed $ecx
@@ -22,13 +20,13 @@ define <3 x i16> @zext_i8(<3 x i8>) {
;
; SSE41-LABEL: zext_i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
-; SSE41-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: pextrw $2, %xmm0, %edx
+; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: pextrw $1, %xmm0, %edx
-; SSE41-NEXT: pextrw $2, %xmm0, %ecx
+; SSE41-NEXT: pextrw $4, %xmm0, %ecx
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: # kill: def $dx killed $dx killed $edx
; SSE41-NEXT: # kill: def $cx killed $cx killed $ecx
@@ -36,13 +34,13 @@ define <3 x i16> @zext_i8(<3 x i8>) {
;
; AVX-32-LABEL: zext_i8:
; AVX-32: # %bb.0:
-; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-32-NEXT: vmovd %xmm0, %eax
-; AVX-32-NEXT: vpextrw $1, %xmm0, %edx
-; AVX-32-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX-32-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm1
+; AVX-32-NEXT: vpextrw $2, %xmm0, %edx
+; AVX-32-NEXT: vmovd %xmm1, %eax
+; AVX-32-NEXT: vpextrw $4, %xmm1, %ecx
; AVX-32-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-32-NEXT: # kill: def $dx killed $dx killed $edx
; AVX-32-NEXT: # kill: def $cx killed $cx killed $ecx
@@ -51,12 +49,12 @@ define <3 x i16> @zext_i8(<3 x i8>) {
; AVX-64-LABEL: zext_i8:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
-; AVX-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
-; AVX-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; AVX-64-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-64-NEXT: vmovd %xmm0, %eax
-; AVX-64-NEXT: vpextrw $1, %xmm0, %edx
-; AVX-64-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX-64-NEXT: vpextrw $2, %xmm0, %edx
+; AVX-64-NEXT: vpextrw $4, %xmm0, %ecx
; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx
; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx
@@ -69,16 +67,13 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; SSE3-LABEL: sext_i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT: shll $8, %eax
-; SSE3-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; SSE3-NEXT: shll $8, %ecx
-; SSE3-NEXT: movd %ecx, %xmm0
+; SSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSE3-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT: shll $8, %eax
; SSE3-NEXT: pinsrw $2, %eax, %xmm0
+; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: psraw $8, %xmm0
-; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pextrw $0, %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %edx
; SSE3-NEXT: pextrw $2, %xmm0, %ecx
; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
@@ -89,12 +84,13 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; SSE41-LABEL: sext_i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
-; SSE41-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
-; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: pslld $24, %xmm0
+; SSE41-NEXT: psrad $24, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: pextrw $1, %xmm0, %edx
-; SSE41-NEXT: pextrw $2, %xmm0, %ecx
+; SSE41-NEXT: pextrw $2, %xmm0, %edx
+; SSE41-NEXT: pextrw $4, %xmm0, %ecx
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: # kill: def $dx killed $dx killed $edx
; SSE41-NEXT: # kill: def $cx killed $cx killed $ecx
@@ -103,12 +99,13 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; AVX-32-LABEL: sext_i8:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX-32-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX-32-NEXT: vmovd %xmm0, %eax
-; AVX-32-NEXT: vpextrw $1, %xmm0, %edx
-; AVX-32-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX-32-NEXT: vpextrw $2, %xmm0, %edx
+; AVX-32-NEXT: vpextrw $4, %xmm0, %ecx
; AVX-32-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-32-NEXT: # kill: def $dx killed $dx killed $edx
; AVX-32-NEXT: # kill: def $cx killed $cx killed $ecx
@@ -117,12 +114,13 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; AVX-64-LABEL: sext_i8:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
-; AVX-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
-; AVX-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; AVX-64-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX-64-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX-64-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX-64-NEXT: vmovd %xmm0, %eax
-; AVX-64-NEXT: vpextrw $1, %xmm0, %edx
-; AVX-64-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX-64-NEXT: vpextrw $2, %xmm0, %edx
+; AVX-64-NEXT: vpextrw $4, %xmm0, %ecx
; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx
; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx
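For reference, the zext_i8/sext_i8 checks above boil down to a pair of odd-sized vector extensions. A minimal sketch of the IR being compiled (the bodies are not visible in this hunk, so the exact shape is an assumption based on the function names and signatures):

define <3 x i16> @zext_i8(<3 x i8>) {
  ; assumed body: a plain zero-extension of the odd <3 x i8> vector
  %2 = zext <3 x i8> %0 to <3 x i16>
  ret <3 x i16> %2
}

define <3 x i16> @sext_i8(<3 x i8>) {
  ; assumed body: the sign-extending counterpart; the psllw/psraw and
  ; pslld/psrad shift pairs in the checks above implement the sign fill
  %2 = sext <3 x i8> %0 to <3 x i16>
  ret <3 x i16> %2
}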
Modified: llvm/trunk/test/CodeGen/X86/promote.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/promote.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/promote.ll (original)
+++ llvm/trunk/test/CodeGen/X86/promote.ll Tue Aug 6 16:00:43 2019
@@ -6,19 +6,18 @@ define i32 @mul_f(<4 x i8>* %A) {
; X86-LABEL: mul_f:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X86-NEXT: pmullw %xmm0, %xmm0
-; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-NEXT: pmaddwd %xmm0, %xmm0
+; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT: movd %xmm0, (%eax)
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: mul_f:
; X64: # %bb.0: # %entry
-; X64-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-NEXT: pmullw %xmm0, %xmm0
-; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-NEXT: pmaddwd %xmm0, %xmm0
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT: movd %xmm0, (%rax)
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
@@ -33,16 +32,18 @@ define i32 @shuff_f(<4 x i8>* %A) {
; X86-LABEL: shuff_f:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: paddb %xmm0, %xmm0
+; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-NEXT: paddd %xmm0, %xmm0
+; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT: movd %xmm0, (%eax)
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: shuff_f:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: paddb %xmm0, %xmm0
+; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-NEXT: paddd %xmm0, %xmm0
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT: movd %xmm0, (%rax)
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
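Both promote.ll functions operate on a <4 x i8> reached through %A. A sketch of the likely IR, assuming the simplest bodies consistent with the squaring (pmullw/pmaddwd of x with itself), the doubling (paddb/paddd of x with itself), and the 4-byte movd store seen above; the store target is an assumption:

define i32 @mul_f(<4 x i8>* %A) {
entry:
  %v = load <4 x i8>, <4 x i8>* %A
  %m = mul <4 x i8> %v, %v        ; squares each byte lane
  store <4 x i8> %m, <4 x i8>* %A ; destination assumed
  ret i32 0
}

define i32 @shuff_f(<4 x i8>* %A) {
entry:
  %v = load <4 x i8>, <4 x i8>* %A
  %s = add <4 x i8> %v, %v        ; doubles each byte lane
  store <4 x i8> %s, <4 x i8>* %A ; destination assumed
  ret i32 0
}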
Modified: llvm/trunk/test/CodeGen/X86/psubus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/psubus.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/psubus.ll (original)
+++ llvm/trunk/test/CodeGen/X86/psubus.ll Tue Aug 6 16:00:43 2019
@@ -2203,16 +2203,16 @@ define void @subus_v8i8(<8 x i8>* %p1, <
define void @subus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) {
; SSE-LABEL: subus_v4i8:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: psubusb %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: subus_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -2228,8 +2228,8 @@ define void @subus_v4i8(<4 x i8>* %p1, <
define void @subus_v2i8(<2 x i8>* %p1, <2 x i8>* %p2) {
; SSE2-LABEL: subus_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: psubusb %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rdi)
@@ -2237,8 +2237,8 @@ define void @subus_v2i8(<2 x i8>* %p1, <
;
; SSSE3-LABEL: subus_v2i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: psubusb %xmm1, %xmm0
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
@@ -2246,16 +2246,16 @@ define void @subus_v2i8(<2 x i8>* %p1, <
;
; SSE41-LABEL: subus_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: psubusb %xmm1, %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: subus_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -2296,16 +2296,16 @@ define void @subus_v4i16(<4 x i16>* %p1,
define void @subus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) {
; SSE-LABEL: subus_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: psubusw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: subus_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
; AVX-NEXT: retq
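The subus_* functions test unsigned saturating subtraction (psubusb/psubusw) on narrow vectors; the revert only changes how many bytes the operand loads read. A sketch of the canonical IR pattern these instructions are matched from, assuming the select-of-min form (the actual bodies are not shown in this hunk):

define void @subus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) {
  %a = load <4 x i8>, <4 x i8>* %p1
  %b = load <4 x i8>, <4 x i8>* %p2
  ; a - min(a, b) is usubsat(a, b): a - b when a >= b, else 0
  %cmp = icmp ult <4 x i8> %a, %b
  %min = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
  %sub = sub <4 x i8> %a, %min
  store <4 x i8> %sub, <4 x i8>* %p1
  ret void
}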
Modified: llvm/trunk/test/CodeGen/X86/ret-mmx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ret-mmx.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/ret-mmx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/ret-mmx.ll Tue Aug 6 16:00:43 2019
@@ -33,7 +33,7 @@ define <2 x i32> @t3() nounwind {
; CHECK-LABEL: t3:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: retq
ret <2 x i32> <i32 1, i32 0>
}
Modified: llvm/trunk/test/CodeGen/X86/sad.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sad.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sad.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sad.ll Tue Aug 6 16:00:43 2019
@@ -1074,13 +1074,12 @@ define i32 @sad_2i8() nounwind {
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psadbw %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB3_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
@@ -1097,13 +1096,12 @@ define i32 @sad_2i8() nounwind {
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX-NEXT: addq $4, %rax
; AVX-NEXT: jne .LBB3_1
; AVX-NEXT: # %bb.2: # %middle.block
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
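The sad_2i8 loop above accumulates a psadbw result; the revert changes the accumulator legalization (a pshufd plus paddd becomes a plain paddq on the 64-bit psadbw lanes). As a hypothetical scalar reference for one absolute-difference step (a sketch only; the vectorized loop IR is not shown in this hunk):

define i32 @abs_diff_i8(i8 %a, i8 %b) {
  ; widen both bytes, subtract, and select the non-negative difference
  %az = zext i8 %a to i32
  %bz = zext i8 %b to i32
  %d = sub i32 %az, %bz
  %n = sub i32 0, %d
  %pos = icmp sgt i32 %d, -1
  %ad = select i1 %pos, i32 %d, i32 %n
  ret i32 %ad
}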
Modified: llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sadd_sat_vec.ll Tue Aug 6 16:00:43 2019
@@ -569,133 +569,240 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: psllq $32, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: psllq $32, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: paddq %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm2, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: paddd %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: pandn %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: psrld $1, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm2, %xmm4
; SSSE3-NEXT: pandn %xmm0, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddq %xmm1, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT: paddd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT: pandn %xmm4, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm1, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm4
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm1, %xmm5
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm1
+; SSE41-NEXT: pandn %xmm4, %xmm1
+; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
+; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
; AVX512-NEXT: kxorw %k2, %k1, %k1
; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2}
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
; AVX512-NEXT: retq
%z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
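The v2i32 body is visible in the context lines above; the whole test is a thin wrapper around the saturating-add intrinsic. For completeness (the declare line is implied by the call):

declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>)

define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
  ; signed saturating add on two i32 lanes; under promotion the lanes
  ; live in i64 elements, hence the psllq/vpsraq pairs in the new checks
  %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
  ret <2 x i32> %z
}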
Modified: llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll (original)
+++ llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll Tue Aug 6 16:00:43 2019
@@ -147,34 +147,30 @@ define <5 x i16> @test_short_div(<5 x i1
define <4 x i16> @test_ushort_div(<4 x i16> %num, <4 x i16> %div) {
; CHECK-LABEL: test_ushort_div:
; CHECK: # %bb.0:
-; CHECK-NEXT: pextrw $1, %xmm0, %eax
-; CHECK-NEXT: pextrw $1, %xmm1, %ecx
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: pextrd $1, %xmm1, %ecx
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divw %cx
+; CHECK-NEXT: divl %ecx
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: movd %xmm1, %esi
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divw %si
-; CHECK-NEXT: # kill: def $ax killed $ax def $eax
+; CHECK-NEXT: divl %esi
; CHECK-NEXT: movd %eax, %xmm2
-; CHECK-NEXT: pinsrw $1, %ecx, %xmm2
-; CHECK-NEXT: pextrw $2, %xmm0, %eax
-; CHECK-NEXT: pextrw $2, %xmm1, %ecx
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divw %cx
-; CHECK-NEXT: # kill: def $ax killed $ax def $eax
-; CHECK-NEXT: pinsrw $2, %eax, %xmm2
-; CHECK-NEXT: pextrw $3, %xmm0, %eax
-; CHECK-NEXT: pextrw $3, %xmm1, %ecx
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divw %cx
-; CHECK-NEXT: # kill: def $ax killed $ax def $eax
-; CHECK-NEXT: pinsrw $3, %eax, %xmm2
+; CHECK-NEXT: pinsrd $1, %ecx, %xmm2
+; CHECK-NEXT: pextrd $2, %xmm0, %eax
+; CHECK-NEXT: pextrd $2, %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: pinsrd $2, %eax, %xmm2
+; CHECK-NEXT: pextrd $3, %xmm0, %eax
+; CHECK-NEXT: pextrd $3, %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: pinsrd $3, %eax, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%div.r = udiv <4 x i16> %num, %div
@@ -261,34 +257,31 @@ define <3 x i64> @test_ulong_div(<3 x i6
define <4 x i8> @test_char_rem(<4 x i8> %num, <4 x i8> %rem) {
; CHECK-LABEL: test_char_rem:
; CHECK: # %bb.0:
-; CHECK-NEXT: pextrb $1, %xmm0, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: cbtw
-; CHECK-NEXT: pextrb $1, %xmm1, %ecx
-; CHECK-NEXT: idivb %cl
-; CHECK-NEXT: movsbl %ah, %ecx
-; CHECK-NEXT: pextrb $0, %xmm0, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: cbtw
-; CHECK-NEXT: pextrb $0, %xmm1, %edx
-; CHECK-NEXT: idivb %dl
-; CHECK-NEXT: movsbl %ah, %eax
-; CHECK-NEXT: movd %eax, %xmm2
-; CHECK-NEXT: pextrb $2, %xmm0, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: cbtw
-; CHECK-NEXT: pinsrb $1, %ecx, %xmm2
-; CHECK-NEXT: pextrb $2, %xmm1, %ecx
-; CHECK-NEXT: idivb %cl
-; CHECK-NEXT: movsbl %ah, %ecx
-; CHECK-NEXT: pextrb $3, %xmm0, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: cbtw
-; CHECK-NEXT: pinsrb $2, %ecx, %xmm2
-; CHECK-NEXT: pextrb $3, %xmm1, %ecx
-; CHECK-NEXT: idivb %cl
-; CHECK-NEXT: movsbl %ah, %eax
-; CHECK-NEXT: pinsrb $3, %eax, %xmm2
+; CHECK-NEXT: pslld $24, %xmm1
+; CHECK-NEXT: psrad $24, %xmm1
+; CHECK-NEXT: pslld $24, %xmm0
+; CHECK-NEXT: psrad $24, %xmm0
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: pextrd $1, %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %esi
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %esi
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pinsrd $1, %ecx, %xmm2
+; CHECK-NEXT: pextrd $2, %xmm0, %eax
+; CHECK-NEXT: pextrd $2, %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: pinsrd $2, %edx, %xmm2
+; CHECK-NEXT: pextrd $3, %xmm0, %eax
+; CHECK-NEXT: pextrd $3, %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: pinsrd $3, %edx, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%rem.r = srem <4 x i8> %num, %rem
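Both scalar_widen_div.ll changes follow from the element promotion: the i16/i8 lanes are widened to i32 before the per-lane divide sequence. The surrounding IR, reconstructed from the signatures and the udiv/srem context lines visible above (the trailing rets are implied):

define <4 x i16> @test_ushort_div(<4 x i16> %num, <4 x i16> %div) {
  ; per-lane unsigned division: each lane is extracted, divided
  ; (divl instead of divw after the revert), and reinserted
  %div.r = udiv <4 x i16> %num, %div
  ret <4 x i16> %div.r
}

define <4 x i8> @test_char_rem(<4 x i8> %num, <4 x i8> %rem) {
  ; per-lane signed remainder: lanes are sign-extended with
  ; pslld+psrad and the remainder taken with idivl
  %rem.r = srem <4 x i8> %num, %rem
  ret <4 x i8> %rem.r
}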
Modified: llvm/trunk/test/CodeGen/X86/select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/select.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/select.ll (original)
+++ llvm/trunk/test/CodeGen/X86/select.ll Tue Aug 6 16:00:43 2019
@@ -215,27 +215,17 @@ entry:
}
define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
-; GENERIC-LABEL: test5:
-; GENERIC: ## %bb.0:
-; GENERIC-NEXT: testb $1, %dil
-; GENERIC-NEXT: jne LBB4_2
-; GENERIC-NEXT: ## %bb.1:
-; GENERIC-NEXT: movaps %xmm1, %xmm0
-; GENERIC-NEXT: LBB4_2:
-; GENERIC-NEXT: movss %xmm0, (%rsi)
-; GENERIC-NEXT: retq
-;
-; ATOM-LABEL: test5:
-; ATOM: ## %bb.0:
-; ATOM-NEXT: testb $1, %dil
-; ATOM-NEXT: jne LBB4_2
-; ATOM-NEXT: ## %bb.1:
-; ATOM-NEXT: movaps %xmm1, %xmm0
-; ATOM-NEXT: LBB4_2:
-; ATOM-NEXT: movss %xmm0, (%rsi)
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; CHECK-LABEL: test5:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: jne LBB4_2
+; CHECK-NEXT: ## %bb.1:
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: LBB4_2:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: movd %xmm0, (%rsi)
+; CHECK-NEXT: retq
;
; ATHLON-LABEL: test5:
; ATHLON: ## %bb.0:
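test5 stores a selected <2 x i16>; the GENERIC and ATOM prefixes collapse into one CHECK block because both now emit the same pshufd/pshuflw truncating store. A sketch of the IR under test, assuming the simplest body consistent with the signature and the conditional move above:

define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
  ; pick one of the two vectors and store its four bytes
  %x = select i1 %c, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %x, <2 x i16>* %p
  ret void
}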
Modified: llvm/trunk/test/CodeGen/X86/shift-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shift-combine.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shift-combine.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shift-combine.ll Tue Aug 6 16:00:43 2019
@@ -226,13 +226,9 @@ define <4 x i32> @ashr_add_shl_v4i8(<4 x
;
; X64-LABEL: ashr_add_shl_v4i8:
; X64: # %bb.0:
-; X64-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-NEXT: packuswb %xmm0, %xmm0
-; X64-NEXT: packuswb %xmm0, %xmm0
; X64-NEXT: pcmpeqd %xmm1, %xmm1
-; X64-NEXT: psubb %xmm1, %xmm0
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-NEXT: psubd %xmm1, %xmm0
+; X64-NEXT: pslld $24, %xmm0
; X64-NEXT: psrad $24, %xmm0
; X64-NEXT: retq
%conv = shl <4 x i32> %r, <i32 24, i32 24, i32 24, i32 24>
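The visible shl line fixes the first step of ashr_add_shl_v4i8; the rest is an assumption from the function name and from the pcmpeqd/psub idiom above (subtracting an all-ones vector adds 1 per lane):

define <4 x i32> @ashr_add_shl_v4i8(<4 x i32> %r) {
  %conv = shl <4 x i32> %r, <i32 24, i32 24, i32 24, i32 24>
  ; assumed middle step: add 1 << 24 per lane, i.e. +1 in the top byte
  %sum = add <4 x i32> %conv, <i32 16777216, i32 16777216, i32 16777216, i32 16777216>
  %conv1 = ashr <4 x i32> %sum, <i32 24, i32 24, i32 24, i32 24>
  ret <4 x i32> %conv1
}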
Modified: llvm/trunk/test/CodeGen/X86/shrink_vmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shrink_vmul.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shrink_vmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shrink_vmul.ll Tue Aug 6 16:00:43 2019
@@ -42,13 +42,10 @@ define void @mul_2xi8(i8* nocapture read
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT: vmovd %edx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm1
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
@@ -71,13 +68,10 @@ define void @mul_2xi8(i8* nocapture read
; X64-AVX-LABEL: mul_2xi8:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm1
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
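The shrink_vmul.ll hunks all share one shape: two narrow elements are loaded, extended, multiplied, and stored as two i32 results. A sketch of mul_2xi8 under that reading (the signature is truncated in the hunk header above, so the parameter names, attributes, and the i32* type behind @c are assumptions):

@c = external global i32*

define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
entry:
  %base = load i32*, i32** @c
  %pa = getelementptr inbounds i8, i8* %a, i64 %index
  %pb = getelementptr inbounds i8, i8* %b, i64 %index
  %vpa = bitcast i8* %pa to <2 x i8>*
  %vpb = bitcast i8* %pb to <2 x i8>*
  %va = load <2 x i8>, <2 x i8>* %vpa
  %vb = load <2 x i8>, <2 x i8>* %vpb
  ; widen both operands and do the 2-lane multiply
  %za = zext <2 x i8> %va to <2 x i32>
  %zb = zext <2 x i8> %vb to <2 x i32>
  %m = mul <2 x i32> %zb, %za
  %pd = getelementptr inbounds i32, i32* %base, i64 %index
  %vpd = bitcast i32* %pd to <2 x i32>*
  store <2 x i32> %m, <2 x i32>* %vpd
  ret void
}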
@@ -481,11 +475,10 @@ define void @mul_2xi16(i8* nocapture rea
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
@@ -505,11 +498,10 @@ define void @mul_2xi16(i8* nocapture rea
; X64-AVX-LABEL: mul_2xi16:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
@@ -912,13 +904,10 @@ define void @mul_2xi8_sext(i8* nocapture
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT: vmovd %edx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm1
-; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
@@ -943,13 +932,10 @@ define void @mul_2xi8_sext(i8* nocapture
; X64-AVX-LABEL: mul_2xi8_sext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm1
-; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
@@ -1006,13 +992,10 @@ define void @mul_2xi8_sext_zext(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT: vmovd %edx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm1
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
@@ -1038,13 +1021,10 @@ define void @mul_2xi8_sext_zext(i8* noca
; X64-AVX-LABEL: mul_2xi8_sext_zext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm1
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
@@ -1095,11 +1075,10 @@ define void @mul_2xi16_sext(i8* nocaptur
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
@@ -1119,11 +1098,10 @@ define void @mul_2xi16_sext(i8* nocaptur
; X64-AVX-LABEL: mul_2xi16_sext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
@@ -1160,15 +1138,14 @@ define void @mul_2xi16_sext_zext(i8* noc
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
@@ -1179,11 +1156,10 @@ define void @mul_2xi16_sext_zext(i8* noc
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
@@ -1194,25 +1170,23 @@ define void @mul_2xi16_sext_zext(i8* noc
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_sext_zext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
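The mixed-extension variant differs only in how the two operands are widened; a sketch of mul_2xi16_sext_zext matching the vpmovsxwq/vpmovzxwq pair above (same caveats as the mul_2xi8 sketch: names and the type of @c are assumptions):

define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
entry:
  %base = load i32*, i32** @c
  %pa = getelementptr inbounds i8, i8* %a, i64 %index
  %pb = getelementptr inbounds i8, i8* %b, i64 %index
  %vpa = bitcast i8* %pa to <2 x i16>*
  %vpb = bitcast i8* %pb to <2 x i16>*
  %va = load <2 x i16>, <2 x i16>* %vpa
  %vb = load <2 x i16>, <2 x i16>* %vpb
  ; one operand is sign-extended, the other zero-extended
  %sa = sext <2 x i16> %va to <2 x i32>
  %zb = zext <2 x i16> %vb to <2 x i32>
  %m = mul <2 x i32> %zb, %sa
  %pd = getelementptr inbounds i32, i32* %base, i64 %index
  %vpd = bitcast i32* %pd to <2 x i32>*
  store <2 x i32> %m, <2 x i32>* %vpd
  ret void
}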
@@ -1405,8 +1379,8 @@ define void @mul_2xi8_varconst1(i8* noca
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -1415,10 +1389,9 @@ define void @mul_2xi8_varconst1(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1429,18 +1402,17 @@ define void @mul_2xi8_varconst1(i8* noca
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1482,10 +1454,9 @@ define void @mul_2xi8_varconst2(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1505,10 +1476,9 @@ define void @mul_2xi8_varconst2(i8* noca
; X64-AVX-LABEL: mul_2xi8_varconst2:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1539,8 +1509,11 @@ define void @mul_2xi8_varconst3(i8* noca
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -1549,10 +1522,9 @@ define void @mul_2xi8_varconst3(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1563,18 +1535,20 @@ define void @mul_2xi8_varconst3(i8* noca
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst3:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1618,10 +1592,9 @@ define void @mul_2xi8_varconst4(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1643,10 +1616,9 @@ define void @mul_2xi8_varconst4(i8* noca
; X64-AVX-LABEL: mul_2xi8_varconst4:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1690,10 +1662,9 @@ define void @mul_2xi8_varconst5(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1715,10 +1686,9 @@ define void @mul_2xi8_varconst5(i8* noca
; X64-AVX-LABEL: mul_2xi8_varconst5:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1762,10 +1732,9 @@ define void @mul_2xi8_varconst6(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1787,10 +1756,9 @@ define void @mul_2xi8_varconst6(i8* noca
; X64-AVX-LABEL: mul_2xi8_varconst6:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1831,9 +1799,9 @@ define void @mul_2xi16_varconst1(i8* noc
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1852,9 +1820,9 @@ define void @mul_2xi16_varconst1(i8* noc
; X64-AVX-LABEL: mul_2xi16_varconst1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1895,9 +1863,9 @@ define void @mul_2xi16_varconst2(i8* noc
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1916,9 +1884,9 @@ define void @mul_2xi16_varconst2(i8* noc
; X64-AVX-LABEL: mul_2xi16_varconst2:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1948,12 +1916,9 @@ define void @mul_2xi16_varconst3(i8* noc
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -1962,9 +1927,9 @@ define void @mul_2xi16_varconst3(i8* noc
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1974,21 +1939,18 @@ define void @mul_2xi16_varconst3(i8* noc
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst3:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -2018,12 +1980,9 @@ define void @mul_2xi16_varconst4(i8* noc
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -2032,9 +1991,9 @@ define void @mul_2xi16_varconst4(i8* noc
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -2044,21 +2003,18 @@ define void @mul_2xi16_varconst4(i8* noc
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst4:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
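The two expectation sets above reflect different legalizations of a <2 x i32> multiply: with the reverted widening default, the elements stay in 32-bit lanes and a single vpmulld suffices; the restored promotion-based lowering extends each element into a 64-bit lane, multiplies with vpmuludq (32x32->64), and compresses the low dwords back with a vpshufd. A minimal sketch of the IR pattern the varconst tests exercise (simplified; the actual test bodies load through i8* arguments and store through a global):

define void @mul_2xi16_sketch(<2 x i16>* %a, <2 x i32>* %c) nounwind {
entry:
  %val = load <2 x i16>, <2 x i16>* %a
  %ext = zext <2 x i16> %val to <2 x i32>           ; sext in the signed variants
  %mul = mul <2 x i32> %ext, <i32 65535, i32 65535> ; constant operand varies per test
  store <2 x i32> %mul, <2 x i32>* %c
  ret void
}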
Modified: llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-128.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-128.ll Tue Aug 6 16:00:43 2019
@@ -44,12 +44,32 @@ define void @shuffle_v16i8_to_v8i8_1(<16
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v8i8_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -80,12 +100,31 @@ define void @shuffle_v8i16_to_v4i16_1(<8
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v8i16_to_v4i16_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
store <4 x i16> %strided.vec, <4 x i16>* %S
@@ -105,11 +144,29 @@ define void @shuffle_v4i32_to_v2i32_1(<4
; AVX-NEXT: vmovlps %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v4i32_to_v2i32_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
-; AVX512-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
+; AVX512F-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %L
%strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
store <2 x i32> %strided.vec, <2 x i32>* %S
@@ -147,12 +204,32 @@ define void @shuffle_v16i8_to_v4i8_1(<16
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v4i8_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -186,12 +263,31 @@ define void @shuffle_v16i8_to_v4i8_2(<16
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v4i8_2:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -229,12 +325,31 @@ define void @shuffle_v16i8_to_v4i8_3(<16
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v4i8_3:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $24, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -279,9 +394,8 @@ define void @shuffle_v8i16_to_v2i16_1(<8
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
@@ -293,9 +407,8 @@ define void @shuffle_v8i16_to_v2i16_1(<8
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
@@ -341,9 +454,8 @@ define void @shuffle_v8i16_to_v2i16_2(<8
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
@@ -355,9 +467,8 @@ define void @shuffle_v8i16_to_v2i16_2(<8
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
@@ -403,9 +514,8 @@ define void @shuffle_v8i16_to_v2i16_3(<8
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT: vpsrlq $48, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
@@ -417,9 +527,8 @@ define void @shuffle_v8i16_to_v2i16_3(<8
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
@@ -456,12 +565,32 @@ define void @shuffle_v16i8_to_v2i8_1(<16
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
store <2 x i8> %strided.vec, <2 x i8>* %S
@@ -494,12 +623,31 @@ define void @shuffle_v16i8_to_v2i8_2(<16
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8_2:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
store <2 x i8> %strided.vec, <2 x i8>* %S
@@ -535,12 +683,31 @@ define void @shuffle_v16i8_to_v2i8_3(<16
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8_3:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $24, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
store <2 x i8> %strided.vec, <2 x i8>* %S
@@ -573,12 +740,31 @@ define void @shuffle_v16i8_to_v2i8_4(<16
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8_4:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
store <2 x i8> %strided.vec, <2 x i8>* %S
@@ -614,12 +800,31 @@ define void @shuffle_v16i8_to_v2i8_5(<16
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8_5:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlq $40, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlq $40, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
store <2 x i8> %strided.vec, <2 x i8>* %S
@@ -652,12 +857,31 @@ define void @shuffle_v16i8_to_v2i8_6(<16
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8_6:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlq $48, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
store <2 x i8> %strided.vec, <2 x i8>* %S
@@ -693,12 +917,31 @@ define void @shuffle_v16i8_to_v2i8_7(<16
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8_7:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlq $56, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlq $56, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
store <2 x i8> %strided.vec, <2 x i8>* %S
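Throughout this file the old combined AVX512 prefix is split four ways because only the VL-enabled runs change: they now use a vector shift plus a truncating store (vpmov*) instead of vpshufb. The equivalence is easiest to see in IR; for the <i32 7, i32 15> extract just above, a little-endian sketch (not part of the test file) is:

; Bytes 7 and 15 are the top bytes of the two i64 lanes; shifting each
; lane right by 56 bits moves them into the low bytes, and the per-lane
; truncate (vpmovqb) stores exactly those bytes.
%q  = bitcast <16 x i8> %vec to <2 x i64>
%sh = lshr <2 x i64> %q, <i64 56, i64 56>
%t  = trunc <2 x i64> %sh to <2 x i8>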
Modified: llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll Tue Aug 6 16:00:43 2019
@@ -125,16 +125,49 @@ define void @shuffle_v32i8_to_v8i8_1(<32
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -153,16 +186,46 @@ define void @shuffle_v32i8_to_v8i8_2(<32
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
store <8 x i8> %strided.vec, <8 x i8>* %S
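The AVX512BWVL lowering above leans on vpermi2w, the two-source word permute: bytes <2,6,...,30> are the low bytes of the odd-numbered words, so gathering words <1,3,...,15> from the two 16-byte halves and truncating each word to its low byte (vpmovwb) yields the strided bytes. As an IR sketch of that reassociation (assumed, not from the test file):

%w = bitcast <32 x i8> %vec to <16 x i16>
%g = shufflevector <16 x i16> %w, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%t = trunc <8 x i16> %g to <8 x i8>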
@@ -181,16 +244,49 @@ define void @shuffle_v32i8_to_v8i8_3(<32
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -243,11 +339,11 @@ define void @shuffle_v16i16_to_v4i16_1(<
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
@@ -264,9 +360,9 @@ define void @shuffle_v16i16_to_v4i16_1(<
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,4,5,12,13]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -318,13 +414,9 @@ define void @shuffle_v16i16_to_v4i16_2(<
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
@@ -340,10 +432,9 @@ define void @shuffle_v16i16_to_v4i16_2(<
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [2,6,10,14,2,3,10,11]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
+; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
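Here the even-offset case collapses further: words <2,6,10,14> are the low words of dwords <1,3,5,7>, so a single vshufps can gather those dwords from both 128-bit halves and vpmovdw truncates them in the store. A sketch of the underlying identity (little-endian, not taken from the generated checks):

%d = bitcast <16 x i16> %vec to <8 x i32>
%g = shufflevector <8 x i32> %d, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%t = trunc <4 x i32> %g to <4 x i16>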
@@ -397,11 +488,11 @@ define void @shuffle_v16i16_to_v4i16_3(<
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
@@ -418,9 +509,9 @@ define void @shuffle_v16i16_to_v4i16_3(<
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,11,15,2,3,10,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
@@ -440,16 +531,49 @@ define void @shuffle_v32i8_to_v4i8_1(<32
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -468,16 +592,46 @@ define void @shuffle_v32i8_to_v4i8_2(<32
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -496,16 +650,49 @@ define void @shuffle_v32i8_to_v4i8_3(<32
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -524,16 +711,41 @@ define void @shuffle_v32i8_to_v4i8_4(<32
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -552,16 +764,49 @@ define void @shuffle_v32i8_to_v4i8_5(<32
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -580,16 +825,46 @@ define void @shuffle_v32i8_to_v4i8_6(<32
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -608,16 +883,49 @@ define void @shuffle_v32i8_to_v4i8_7(<32
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
store <4 x i8> %strided.vec, <4 x i8>* %S
Modified: llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-512.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-512.ll Tue Aug 6 16:00:43 2019
@@ -425,23 +425,77 @@ define void @shuffle_v32i16_to_v8i16_3(<
}
define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: shuffle_v64i8_to_v8i8_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -449,23 +503,68 @@ define void @shuffle_v64i8_to_v8i8_1(<64
}
define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: shuffle_v64i8_to_v8i8_2:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,9,13,17,21,25,29]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
store <8 x i8> %strided.vec, <8 x i8>* %S
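The vpermt2w + vpmovwb form in the AVX512BWVL block above works because a byte offset of 2 lands on the low byte of an i16 lane: a stride-8 byte pick with an even offset is a stride-4 i16 pick followed by a truncate. A standalone sketch of that rewrite (the function name and the word-level view are my framing, not from the patch):

define void @even_offset_sketch(<32 x i16>* %L, <8 x i8>* %S) nounwind {
  %vec = load <32 x i16>, <32 x i16>* %L
  ; words 1,5,9,... hold bytes 2,10,18,... in their low halves; the
  ; indices match the [1,5,9,13,17,21,25,29] vpermt2w mask emitted above
  %words = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %bytes = trunc <8 x i16> %words to <8 x i8>
  store <8 x i8> %bytes, <8 x i8>* %S
  ret void
}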
@@ -473,23 +572,77 @@ define void @shuffle_v64i8_to_v8i8_2(<64
}
define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: shuffle_v64i8_to_v8i8_3:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -497,23 +650,68 @@ define void @shuffle_v64i8_to_v8i8_3(<64
}
define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: shuffle_v64i8_to_v8i8_4:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,10,14,18,22,26,30]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -521,23 +719,77 @@ define void @shuffle_v64i8_to_v8i8_4(<64
}
define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: shuffle_v64i8_to_v8i8_5:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -545,23 +797,68 @@ define void @shuffle_v64i8_to_v8i8_5(<64
}
define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: shuffle_v64i8_to_v8i8_6:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,11,15,19,23,27,31]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -569,23 +866,77 @@ define void @shuffle_v64i8_to_v8i8_6(<64
}
define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: shuffle_v64i8_to_v8i8_7:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
store <8 x i8> %strided.vec, <8 x i8>* %S
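For the odd offsets (_1, _3, _5, _7), by contrast, the wanted byte sits in the high half of each i16 lane, so a single word-granularity vpermt2w cannot reach it; the AVX512BWVL blocks above instead move the bytes into the low halves of words with vpshufb/vpunpckldq/vpblendd and only then truncate with vpmovwb. One way to express the same computation in IR, as a sketch under the same little-endian assumption (the explicit shift here stands in for what the byte shuffles accomplish):

define void @odd_offset_sketch(<32 x i16>* %L, <8 x i8>* %S) nounwind {
  %vec = load <32 x i16>, <32 x i16>* %L
  ; bytes 7,15,23,... are the high bytes of words 3,7,11,...
  %words = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %hi = lshr <8 x i16> %words, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %bytes = trunc <8 x i16> %hi to <8 x i8>
  store <8 x i8> %bytes, <8 x i8>* %S
  ret void
}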
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128.ll Tue Aug 6 16:00:43 2019
@@ -37,12 +37,32 @@ define void @shuffle_v16i8_to_v8i8(<16 x
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -128,12 +148,31 @@ define void @shuffle_v8i16_to_v4i16(<8 x
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v8i16_to_v4i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_to_v4i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v8i16_to_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
store <4 x i16> %strided.vec, <4 x i16>* %S
@@ -208,11 +247,29 @@ define void @shuffle_v4i32_to_v2i32(<4 x
; AVX-NEXT: vmovlps %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v4i32_to_v2i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_to_v2i32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v4i32_to_v2i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %L
%strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
store <2 x i32> %strided.vec, <2 x i32>* %S
@@ -286,12 +343,31 @@ define void @shuffle_v16i8_to_v4i8(<16 x
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -393,8 +469,7 @@ define void @shuffle_v8i16_to_v2i16(<8 x
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
@@ -407,8 +482,7 @@ define void @shuffle_v8i16_to_v2i16(<8 x
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
@@ -503,12 +577,31 @@ define void @shuffle_v16i8_to_v2i8(<16 x
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
store <2 x i8> %strided.vec, <2 x i8>* %S
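The 128-bit cases above show the same idea at its smallest: picking bytes 0 and 8 of a <16 x i8> is a <2 x i64> to <2 x i8> truncate on little-endian x86, which the VL targets store directly with vpmovqb. A minimal sketch (illustrative, not from the patch):

define void @trunc_store_sketch(<2 x i64>* %L, <2 x i8>* %S) nounwind {
  %vec = load <2 x i64>, <2 x i64>* %L
  ; on AVX512VL this can lower to a single vpmovqb truncating store
  %t = trunc <2 x i64> %vec to <2 x i8>
  store <2 x i8> %t, <2 x i8>* %S
  ret void
}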
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll Tue Aug 6 16:00:43 2019
@@ -408,20 +408,17 @@ define void @shuffle_v32i8_to_v8i8(<32 x
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112]
-; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
+; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -430,21 +427,32 @@ define void @shuffle_v32i8_to_v8i8(<32 x
}
define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX-LABEL: trunc_v8i32_to_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_v8i32_to_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_v8i32_to_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -459,7 +467,8 @@ define void @trunc_v8i32_to_v8i8(<32 x i
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -490,59 +499,53 @@ define <2 x i64> @trunc_v8i32_to_v8i8_re
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated.vec = trunc <8 x i32> %vec to <8 x i8>
@@ -674,59 +677,53 @@ define <16 x i8> @trunc_v8i32_to_v8i8_re
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <8 x i32> %vec to <8 x i8>
@@ -740,72 +737,58 @@ define <2 x i64> @trunc_v4i64_to_v4i16_r
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i16>
@@ -947,72 +930,58 @@ define <8 x i16> @trunc_v4i64_to_v4i16_r
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i16>
@@ -1024,66 +993,58 @@ define <16 x i8> @trunc_v4i64_to_v4i8_re
; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VBMIVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i8>
@@ -1135,13 +1096,9 @@ define void @shuffle_v16i16_to_v4i16(<16
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
@@ -1157,18 +1114,16 @@ define void @shuffle_v16i16_to_v4i16(<16
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
-; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX512VBMIVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1179,39 +1134,34 @@ define void @shuffle_v16i16_to_v4i16(<16
define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX1-NEXT: vmovaps (%rdi), %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1226,7 +1176,8 @@ define void @trunc_v4i64_to_v4i16(<16 x
; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1276,13 +1227,9 @@ define void @shuffle_v32i8_to_v4i8(<32 x
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
@@ -1298,21 +1245,16 @@ define void @shuffle_v32i8_to_v4i8(<32 x
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808]
-; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX512VBMIVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
@@ -1321,21 +1263,36 @@ define void @shuffle_v32i8_to_v4i8(<32 x
}
define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX-LABEL: trunc_v4i64_to_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_v4i64_to_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps (%rdi), %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1350,7 +1307,8 @@ define void @trunc_v4i64_to_v4i8(<32 x i
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
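
For reference, the shuffle-vs-trunc-256.ll hunks above all reduce to the same
IR shape: a <4 x i64> value truncated to a sub-128-bit vector and either stored
or padded back out. A minimal .ll sketch of the store form (illustrative only;
the function name is made up, and the RUN lines of the file above are assumed):

define void @trunc_store_sketch(<16 x i16>* %L, <4 x i16>* %S) nounwind {
  %vec = load <16 x i16>, <16 x i16>* %L
  ; Reinterpret the 256-bit load as four i64 lanes, then narrow each to i16.
  %bc = bitcast <16 x i16> %vec to <4 x i64>
  %truncated = trunc <4 x i64> %bc to <4 x i16>
  store <4 x i16> %truncated, <4 x i16>* %S
  ret void
}

With widening disabled again, the <4 x i16> result is promoted rather than
widened, which is why the AVX512F/AVX512BW checks above switch from a single
vpmovqw to vpmovqd plus a vpshufb.
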
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll Tue Aug 6 16:00:43 2019
@@ -585,20 +585,11 @@ define void @shuffle_v64i8_to_v8i8(<64 x
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
@@ -621,10 +612,10 @@ define void @shuffle_v64i8_to_v8i8(<64 x
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
-; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -893,7 +884,6 @@ define <16 x i8> @trunc_v8i64_to_v8i8_re
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%truncated = trunc <8 x i64> %vec to <8 x i8>
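
The trunc_v8i64_to_v8i8_return_v16i8 hunk above exercises a truncate whose
result is padded back to a full 128 bits. A sketch of that shape (illustrative
only; the zero padding is an assumption, consistent with the vmovq zero-fill
the old checks carried):

define <16 x i8> @trunc_pad_sketch(<8 x i64> %vec) nounwind {
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  ; Pad the eight truncated bytes back out to a full XMM register with zeros.
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}

The removed vmovq line suggests the reverted lowering now trusts vpmovqb to
zero the upper lanes of its destination, making the explicit zero-fill
redundant.
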
Modified: llvm/trunk/test/CodeGen/X86/slow-pmulld.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/slow-pmulld.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/slow-pmulld.ll (original)
+++ llvm/trunk/test/CodeGen/X86/slow-pmulld.ll Tue Aug 6 16:00:43 2019
@@ -20,74 +20,74 @@
define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
; CHECK32-LABEL: test_mul_v4i32_v4i8:
; CHECK32: # %bb.0:
-; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8:
; KNL-32: # %bb.0:
-; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8:
; KNL-64: # %bb.0:
-; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
@@ -99,31 +99,34 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i8:
; SLM32: # %bb.0:
-; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLM32-NEXT: movdqa %xmm0, %xmm1
+; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm1
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhw %xmm0, %xmm2
; SLM32-NEXT: movdqa %xmm1, %xmm0
-; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8:
; SLM64: # %bb.0:
-; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLM64-NEXT: movdqa %xmm0, %xmm1
+; SLM64-NEXT: pand {{.*}}(%rip), %xmm1
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhw %xmm0, %xmm2
; SLM64-NEXT: movdqa %xmm1, %xmm0
-; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLOW32-NEXT: movdqa %xmm0, %xmm1
+; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm1
; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm2
; SLOW32-NEXT: pmulhw %xmm0, %xmm2
@@ -135,7 +138,8 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8
;
; SLOW64-LABEL: test_mul_v8i32_v8i8:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLOW64-NEXT: movdqa %xmm0, %xmm1
+; SLOW64-NEXT: pand {{.*}}(%rip), %xmm1
; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm2
; SLOW64-NEXT: pmulhw %xmm0, %xmm2
@@ -147,9 +151,10 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
@@ -157,9 +162,10 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
@@ -167,50 +173,58 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8:
; KNL-32: # %bb.0:
-; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8:
; KNL-64: # %bb.0:
-; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
@@ -395,46 +409,72 @@ define <16 x i32> @test_mul_v16i32_v16i8
}
define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
-; CHECK32-LABEL: test_mul_v4i32_v4i16:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; CHECK32-NEXT: movdqa %xmm0, %xmm2
-; CHECK32-NEXT: pmulhuw %xmm1, %xmm2
-; CHECK32-NEXT: pmullw %xmm1, %xmm0
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK32-NEXT: retl
+; SLM32-LABEL: test_mul_v4i32_v4i16:
+; SLM32: # %bb.0:
+; SLM32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SLM32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; SLM32-NEXT: movdqa %xmm0, %xmm2
+; SLM32-NEXT: pmullw %xmm1, %xmm0
+; SLM32-NEXT: pmulhuw %xmm1, %xmm2
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM32-NEXT: retl
;
-; CHECK64-LABEL: test_mul_v4i32_v4i16:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; CHECK64-NEXT: movdqa %xmm0, %xmm2
-; CHECK64-NEXT: pmulhuw %xmm1, %xmm2
-; CHECK64-NEXT: pmullw %xmm1, %xmm0
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK64-NEXT: retq
+; SLM64-LABEL: test_mul_v4i32_v4i16:
+; SLM64: # %bb.0:
+; SLM64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SLM64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; SLM64-NEXT: movdqa %xmm0, %xmm2
+; SLM64-NEXT: pmullw %xmm1, %xmm0
+; SLM64-NEXT: pmulhuw %xmm1, %xmm2
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v4i32_v4i16:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; SLOW32-NEXT: movdqa %xmm0, %xmm2
+; SLOW32-NEXT: pmulhuw %xmm1, %xmm2
+; SLOW32-NEXT: pmullw %xmm1, %xmm0
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v4i32_v4i16:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; SLOW64-NEXT: movdqa %xmm0, %xmm2
+; SLOW64-NEXT: pmulhuw %xmm1, %xmm2
+; SLOW64-NEXT: pmullw %xmm1, %xmm0
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-32-NEXT: pxor %xmm1, %xmm1
+; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-64-NEXT: pxor %xmm1, %xmm1
+; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16:
; AVX-32: # %bb.0:
-; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
@@ -682,74 +722,74 @@ define <16 x i32> @test_mul_v16i32_v16i1
define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK32: # %bb.0:
-; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-32: # %bb.0:
-; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-64: # %bb.0:
-; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
@@ -761,29 +801,32 @@ define <4 x i32> @test_mul_v4i32_v4i8_mi
define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM32: # %bb.0:
+; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: pmaddwd %xmm2, %xmm0
; SLM32-NEXT: pmaddwd %xmm2, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM64: # %bb.0:
+; SLM64-NEXT: pand {{.*}}(%rip), %xmm0
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: pmaddwd %xmm2, %xmm0
; SLM64-NEXT: pmaddwd %xmm2, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmaddwd %xmm2, %xmm0
; SLOW32-NEXT: pmaddwd %xmm2, %xmm1
@@ -791,9 +834,10 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
;
; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW64-NEXT: pand {{.*}}(%rip), %xmm0
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmaddwd %xmm2, %xmm0
; SLOW64-NEXT: pmaddwd %xmm2, %xmm1
@@ -801,9 +845,10 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
@@ -811,9 +856,10 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
@@ -821,50 +867,58 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-32: # %bb.0:
-; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-64: # %bb.0:
-; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
@@ -1033,38 +1087,44 @@ define <16 x i32> @test_mul_v16i32_v16i8
define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK32: # %bb.0:
-; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK32-NEXT: pxor %xmm1, %xmm1
+; CHECK32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; CHECK32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK64-NEXT: pxor %xmm1, %xmm1
+; CHECK64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; CHECK64-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-32-NEXT: pxor %xmm1, %xmm1
+; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-64-NEXT: pxor %xmm1, %xmm1
+; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-32: # %bb.0:
-; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
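
The slow-pmulld.ll hunks all follow one pattern: a sub-128-bit integer vector
zero-extended to i32 lanes and multiplied by a splat constant. A sketch of the
v4i8 case (illustrative only; the 18778 splat matches the constants in the
checks above, but the function name is made up):

define <4 x i32> @mul_zext_sketch(<4 x i8> %A) {
  ; Zero-extend each byte to 32 bits, then multiply by a uniform constant.
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

Under widening legalization the four bytes sat contiguously in the low dword,
so a single pmovzxbd implemented the zext; with the revert the elements are
promoted to 32-bit lanes up front, so the zext only has to mask the high bits
of each lane, which is the pand that replaces pmovzxbd throughout the checks
above.
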
Modified: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-canonical.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-intrinsics-canonical.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-canonical.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-canonical.ll Tue Aug 6 16:00:43 2019
@@ -93,17 +93,40 @@ define <8 x i16> @test_x86_sse2_psubus_w
define <8 x i8> @test_x86_sse2_paddus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_paddus_b_64:
; SSE: ## %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
+; SSE-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4
+; SSE-NEXT: pand %xmm2, %xmm1 ## encoding: [0x66,0x0f,0xdb,0xca]
+; SSE-NEXT: packuswb %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x67,0xc9]
+; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT: packuswb %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x67,0xc0]
; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1]
+; SSE-NEXT: punpcklbw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x60,0xc0]
+; SSE-NEXT: ## xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_paddus_b_64:
; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x71,0x00,0xca]
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc2]
; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1]
+; AVX2-NEXT: vpmovzxbw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x30,0xc0]
+; AVX2-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_paddus_b_64:
; SKX: ## %bb.0:
+; SKX-NEXT: vmovdqa LCPI4_0, %xmm2 ## EVEX TO VEX Compression xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4
+; SKX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x00,0xca]
+; SKX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc2]
; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
+; SKX-NEXT: vpmovzxbw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xc0]
+; SKX-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: retl ## encoding: [0xc3]
%1 = add <8 x i8> %a0, %a1
%2 = icmp ugt <8 x i8> %a0, %1
@@ -114,17 +137,45 @@ define <8 x i8> @test_x86_sse2_paddus_b_
define <4 x i16> @test_x86_sse2_paddus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_paddus_w_64:
; SSE: ## %bb.0:
+; SSE-NEXT: pshuflw $232, %xmm1, %xmm1 ## encoding: [0xf2,0x0f,0x70,0xc9,0xe8]
+; SSE-NEXT: ## xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw $232, %xmm1, %xmm1 ## encoding: [0xf3,0x0f,0x70,0xc9,0xe8]
+; SSE-NEXT: ## xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd $232, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xe8]
+; SSE-NEXT: ## xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw $232, %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x70,0xc0,0xe8]
+; SSE-NEXT: ## xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw $232, %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x70,0xc0,0xe8]
+; SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd $232, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0xe8]
+; SSE-NEXT: ## xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1]
+; SSE-NEXT: punpcklwd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x61,0xc0]
+; SSE-NEXT: ## xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_paddus_w_64:
; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x71,0x00,0xca]
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc2]
; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1]
+; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; AVX2-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_paddus_w_64:
; SKX: ## %bb.0:
+; SKX-NEXT: vmovdqa LCPI5_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
+; SKX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x00,0xca]
+; SKX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc2]
; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
+; SKX-NEXT: vpmovzxwd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; SKX-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SKX-NEXT: retl ## encoding: [0xc3]
%1 = add <4 x i16> %a0, %a1
%2 = icmp ugt <4 x i16> %a0, %1
@@ -135,17 +186,36 @@ define <4 x i16> @test_x86_sse2_paddus_w
define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_psubus_b_64:
; SSE: ## %bb.0:
-; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
+; SSE-NEXT: ## fixup A - offset: 4, value: LCPI6_0, kind: FK_Data_4
+; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9]
+; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda]
+; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT: pmaxsw %xmm3, %xmm0 ## encoding: [0x66,0x0f,0xee,0xc3]
+; SSE-NEXT: psubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf9,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psubus_b_64:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ## encoding: [0xc5,0xf1,0xdb,0xda]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc2]
+; AVX2-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
+; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf9,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psubus_b_64:
; SKX: ## %bb.0:
-; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
+; SKX-NEXT: vpbroadcastw LCPI6_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,255,255,255,255,255,255,255]
+; SKX-NEXT: ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4
+; SKX-NEXT: vpand %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xda]
+; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc2]
+; SKX-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
+; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%cmp = icmp ugt <8 x i8> %a0, %a1
%sel = select <8 x i1> %cmp, <8 x i8> %a0, <8 x i8> %a1
@@ -156,17 +226,41 @@ define <8 x i8> @test_x86_sse2_psubus_b_
define <4 x i16> @test_x86_sse2_psubus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_psubus_w_64:
; SSE: ## %bb.0:
-; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
+; SSE-NEXT: ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
+; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9]
+; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda]
+; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0]
+; SSE-NEXT: pcmpgtd %xmm3, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd3]
+; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT: pandn %xmm3, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd3]
+; SSE-NEXT: por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0]
+; SSE-NEXT: psubd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xfa,0xd1]
+; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psubus_w_64:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
+; AVX2-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa]
+; AVX2-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa]
+; AVX2-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3f,0xc3]
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfa,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psubus_w_64:
; SKX: ## %bb.0:
-; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
+; SKX-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa]
+; SKX-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa]
+; SKX-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SKX-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc3]
+; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%cmp = icmp ugt <4 x i16> %a0, %a1
%sel = select <4 x i1> %cmp, <4 x i16> %a0, <4 x i16> %a1
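
The psubus tests above all match one IR idiom: unsigned-greater-than
compare, select (i.e. unsigned max), then subtract. A minimal scalar
sketch of the pattern, with an illustrative function name that is not
part of the patch:

define i16 @usubsat_sketch(i16 %a, i16 %b) {
  %cmp = icmp ugt i16 %a, %b             ; a > b: the subtract cannot wrap
  %sel = select i1 %cmp, i16 %a, i16 %b  ; unsigned max(a, b)
  %sub = sub i16 %sel, %b                ; max(a, b) - b == a - b clamped at 0
  ret i16 %sub
}

With widening legalization the <8 x i8> and <4 x i16> forms were wide
enough to match psubusb/psubusw directly; after the revert the odd-sized
vectors are promoted instead, so the pattern no longer fits the byte/word
instructions and the backend emits the mask-max-subtract sequences shown
in the new CHECK lines.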
Modified: llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll Tue Aug 6 16:00:43 2019
@@ -321,9 +321,8 @@ define <4 x i32> @shl_srl_v4i32(<4 x i32
define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {
; CHECK-LABEL: shl_zext_srl_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%srl = lshr <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
%zext = zext <4 x i16> %srl to <4 x i32>
@@ -335,7 +334,6 @@ define <4 x i16> @sra_trunc_srl_v4i32(<4
; CHECK-LABEL: sra_trunc_srl_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: psrad $19, %xmm0
-; CHECK-NEXT: packssdw %xmm0, %xmm0
; CHECK-NEXT: retq
%srl = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
%trunc = trunc <4 x i32> %srl to <4 x i16>
@@ -346,7 +344,6 @@ define <4 x i16> @sra_trunc_srl_v4i32(<4
define <4 x i32> @shl_zext_shl_v4i32(<4 x i16> %x) nounwind {
; CHECK-LABEL: shl_zext_shl_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: pslld $19, %xmm0
; CHECK-NEXT: retq
%shl0 = shl <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
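
The lone pslld $19 in the new output encodes a fold worth spelling out:
the shift before the zext and the shift after it combine into a single
19-bit shift, because every bit the inner i16 shift discards would land
above bit 31 anyway. A sketch of the equivalence (the outer shift amount
of 17 is inferred from the pslld $19, since the diff context ends here):

define <4 x i32> @shl_zext_shl_sketch(<4 x i16> %x) {
  %shl0 = shl <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
  %zext = zext <4 x i16> %shl0 to <4 x i32>
  %shl1 = shl <4 x i32> %zext, <i32 17, i32 17, i32 17, i32 17>
  ; equivalent to shl (zext %x), 19 -- hence the single pslld $19
  ret <4 x i32> %shl1
}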
Modified: llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/ssub_sat_vec.ll Tue Aug 6 16:00:43 2019
@@ -569,141 +569,248 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: psubd %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psllq $32, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: psubd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: psllq $32, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psubq %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pandn %xmm5, %xmm3
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: psrld $1, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm3, %xmm4
+; SSSE3-NEXT: pandn %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm1, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm6, %xmm1
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: pandn %xmm1, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm3, %xmm1
+; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1
-; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
+; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
; AVX512-NEXT: kxorw %k2, %k1, %k1
; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2}
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
; AVX512-NEXT: retq
%z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
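
For reference, the lane-wise operation all of these v2i32 sequences
implement is signed saturating subtract: overflow happens exactly when
the operands' signs differ and the result's sign differs from the
minuend's. A hand-expanded scalar sketch (names illustrative, not from
the patch):

define i32 @ssubsat_sketch(i32 %x, i32 %y) {
  %res = sub i32 %x, %y
  %sx = icmp slt i32 %x, 0
  %sy = icmp slt i32 %y, 0
  %sr = icmp slt i32 %res, 0
  %diffsigns = xor i1 %sx, %sy        ; x and y have different signs
  %flipped = xor i1 %sx, %sr          ; result sign differs from x's
  %ovf = and i1 %diffsigns, %flipped  ; signed overflow occurred
  %clamp = select i1 %sr, i32 2147483647, i32 -2147483648
  %r = select i1 %ovf, i32 %clamp, i32 %res
  ret i32 %r
}

The psllq $32 prologues in the new output come from <2 x i32> being
promoted to <2 x i64> again: the 32-bit payloads are moved into the high
halves so the 64-bit compares and the final arithmetic shift right see
the correct sign bits.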
Modified: llvm/trunk/test/CodeGen/X86/test-shrink-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/test-shrink-bug.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/test-shrink-bug.ll (original)
+++ llvm/trunk/test/CodeGen/X86/test-shrink-bug.ll Tue Aug 6 16:00:43 2019
@@ -69,9 +69,11 @@ define void @fail(i16 %a, <2 x i8> %b) {
; CHECK-X64-NEXT: testl $263, %edi # imm = 0x107
; CHECK-X64-NEXT: je .LBB1_3
; CHECK-X64-NEXT: # %bb.1:
-; CHECK-X64-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
-; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-X64-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-X64-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; CHECK-X64-NEXT: pand %xmm0, %xmm1
+; CHECK-X64-NEXT: pextrw $4, %xmm1, %eax
; CHECK-X64-NEXT: testb $1, %al
; CHECK-X64-NEXT: jne .LBB1_3
; CHECK-X64-NEXT: # %bb.2: # %no
Modified: llvm/trunk/test/CodeGen/X86/trunc-ext-ld-st.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/trunc-ext-ld-st.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/trunc-ext-ld-st.ll (original)
+++ llvm/trunk/test/CodeGen/X86/trunc-ext-ld-st.ll Tue Aug 6 16:00:43 2019
@@ -8,16 +8,23 @@ define void @load_2_i8(<2 x i8>* %A) {
; SSE2: # %bb.0:
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: paddb {{.*}}(%rip), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: paddq {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: retq
;
; SSE41-LABEL: load_2_i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movzwl (%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: paddb {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: paddq {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE41-NEXT: retq
%T = load <2 x i8>, <2 x i8>* %A
@@ -28,12 +35,25 @@ define void @load_2_i8(<2 x i8>* %A) {
; Read 32-bits
define void @load_2_i16(<2 x i16>* %A) {
-; CHECK-LABEL: load_2_i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0
-; CHECK-NEXT: movd %xmm0, (%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: load_2_i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT: paddq {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: movd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: load_2_i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; SSE41-NEXT: paddq {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE41-NEXT: movd %xmm0, (%rdi)
+; SSE41-NEXT: retq
%T = load <2 x i16>, <2 x i16>* %A
%G = add <2 x i16> %T, <i16 9, i16 7>
store <2 x i16> %G, <2 x i16>* %A
@@ -41,12 +61,22 @@ define void @load_2_i16(<2 x i16>* %A)
}
define void @load_2_i32(<2 x i32>* %A) {
-; CHECK-LABEL: load_2_i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
-; CHECK-NEXT: movq %xmm0, (%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: load_2_i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: load_2_i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: retq
%T = load <2 x i32>, <2 x i32>* %A
%G = add <2 x i32> %T, <i32 9, i32 7>
store <2 x i32> %G, <2 x i32>* %A
@@ -54,12 +84,25 @@ define void @load_2_i32(<2 x i32>* %A)
}
define void @load_4_i8(<4 x i8>* %A) {
-; CHECK-LABEL: load_4_i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: paddb {{.*}}(%rip), %xmm0
-; CHECK-NEXT: movd %xmm0, (%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: load_4_i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: load_4_i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movd %xmm0, (%rdi)
+; SSE41-NEXT: retq
%T = load <4 x i8>, <4 x i8>* %A
%G = add <4 x i8> %T, <i8 1, i8 4, i8 9, i8 7>
store <4 x i8> %G, <4 x i8>* %A
@@ -67,12 +110,24 @@ define void @load_4_i8(<4 x i8>* %A) {
}
define void @load_4_i16(<4 x i16>* %A) {
-; CHECK-LABEL: load_4_i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0
-; CHECK-NEXT: movq %xmm0, (%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: load_4_i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: load_4_i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT: paddw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: retq
%T = load <4 x i16>, <4 x i16>* %A
%G = add <4 x i16> %T, <i16 1, i16 4, i16 9, i16 7>
store <4 x i16> %G, <4 x i16>* %A
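
Per element, the new SSE41 sequences compute the narrow add in wider
lanes and then pack the result back down; for load_4_i8, for instance,
each byte is zero-extended into a 32-bit lane, added, and only the low
byte kept. A scalar sketch of that legalization (illustrative name):

define i8 @byte_add_sketch(i8 %a, i8 %b) {
  %za = zext i8 %a to i32
  %zb = zext i8 %b to i32
  %s = add i32 %za, %zb    ; add in the wide lane
  %t = trunc i32 %s to i8  ; keep the low byte: a + b (mod 256)
  ret i8 %t
}

Under widening legalization the same tests simply used the full-width
paddb/paddw, which is why the deleted CHECK lines are so much shorter.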
Modified: llvm/trunk/test/CodeGen/X86/trunc-subvector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/trunc-subvector.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/trunc-subvector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/trunc-subvector.ll Tue Aug 6 16:00:43 2019
@@ -40,14 +40,25 @@ define <4 x i32> @test2(<8 x i32> %v) {
define <2 x i32> @test3(<8 x i32> %v) {
; SSE2-LABEL: test3:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; AVX-LABEL: test3:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-LABEL: test3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
%t = trunc <2 x i64> %s to <2 x i32>
@@ -57,13 +68,23 @@ define <2 x i32> @test3(<8 x i32> %v) {
define <2 x i32> @test4(<8 x i32> %v) {
; SSE2-LABEL: test4:
; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; AVX-LABEL: test4:
-; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-LABEL: test4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
%t = trunc <2 x i64> %s to <2 x i32>
@@ -73,31 +94,32 @@ define <2 x i32> @test4(<8 x i32> %v) {
define <2 x i32> @test5(<8 x i32> %v) {
; SSE2-LABEL: test5:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
; SSE2-NEXT: retq
;
; AVX2-LABEL: test5:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test5:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
@@ -143,13 +165,23 @@ define <2 x i32> @test8(<8 x i32> %v) {
; SSE2-LABEL: test8:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; AVX-LABEL: test8:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-LABEL: test8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
%t = trunc <2 x i64> %s to <2 x i32>
@@ -159,13 +191,22 @@ define <2 x i32> @test8(<8 x i32> %v) {
define <2 x i32> @test9(<8 x i32> %v) {
; SSE2-LABEL: test9:
; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; AVX-LABEL: test9:
-; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-LABEL: test9:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test9:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
%t = trunc <2 x i64> %s to <2 x i32>
@@ -175,31 +216,28 @@ define <2 x i32> @test9(<8 x i32> %v) {
define <2 x i32> @test10(<8 x i32> %v) {
; SSE2-LABEL: test10:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
; SSE2-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test10:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
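
All of these trunc-subvector tests rest on one identity: truncating a
sign- or zero-extended value returns the original lanes, so the whole
extend/shuffle/truncate chain should fold to a plain subvector extract.
Scalar form of the fold:

define i32 @trunc_of_sext_sketch(i32 %v) {
  %x = sext i32 %v to i64
  %t = trunc i64 %x to i32  ; %t is exactly %v again
  ret i32 %t
}

The deleted AVX output exploited this and reduced test3/test8 to a bare
vextractf128; with <2 x i32> promoted to <2 x i64> again, the returned
value must hold extended 64-bit lanes, hence the extra vpmovsxdq and
vpmovzxdq steps in the new CHECK lines.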
Modified: llvm/trunk/test/CodeGen/X86/uadd_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/uadd_sat_vec.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/uadd_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/uadd_sat_vec.ll Tue Aug 6 16:00:43 2019
@@ -542,54 +542,96 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: psllq $32, %xmm1
+; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm1
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: psllq $32, %xmm1
+; SSSE3-NEXT: paddq %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: psrlq $32, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psllq $32, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: paddq %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlq $32, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vmovdqa %xmm1, %xmm2
; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2
-; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512-NEXT: retq
%z = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
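
The deleted SSE41/AVX code in this function used a branch-free form of
unsigned saturating add: clamp x to ~y first, after which the add can
never wrap. Scalar sketch of the trick (illustrative name):

define i32 @uaddsat_sketch(i32 %x, i32 %y) {
  %noty = xor i32 %y, -1                   ; ~y == UINT_MAX - y
  %ult = icmp ult i32 %x, %noty
  %min = select i1 %ult, i32 %x, i32 %noty ; umin(x, UINT_MAX - y)
  %r = add i32 %min, %y                    ; saturates at all-ones
  ret i32 %r
}

Promoted to <2 x i64>, there is no unsigned 64-bit min below AVX512
(vpminuq), so the new SSE paths fall back to the sign-bit-flipped
pcmpgtd carry check instead.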
Modified: llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll (original)
+++ llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll Tue Aug 6 16:00:43 2019
@@ -68,13 +68,17 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <
; CHECK-SSE2-LABEL: out_v2i8:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v2i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <2 x i8> %x, %mask
%notmask = xor <2 x i8> %mask, <i8 -1, i8 -1>
@@ -170,13 +174,17 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <
; CHECK-SSE2-LABEL: out_v4i8:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v4i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
@@ -239,13 +247,17 @@ define <4 x i8> @out_v4i8_undef(<4 x i8>
; CHECK-SSE2-LABEL: out_v4i8_undef:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v4i8_undef:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
@@ -288,13 +300,17 @@ define <2 x i16> @out_v2i16(<2 x i16> %x
; CHECK-SSE2-LABEL: out_v2i16:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v2i16:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <2 x i16> %x, %mask
%notmask = xor <2 x i16> %mask, <i16 -1, i16 -1>
@@ -467,13 +483,17 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <
; CHECK-SSE2-LABEL: out_v8i8:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v8i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <8 x i8> %x, %mask
%notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -548,13 +568,17 @@ define <4 x i16> @out_v4i16(<4 x i16> %x
; CHECK-SSE2-LABEL: out_v4i16:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v4i16:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <4 x i16> %x, %mask
%notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1>
@@ -617,13 +641,17 @@ define <4 x i16> @out_v4i16_undef(<4 x i
; CHECK-SSE2-LABEL: out_v4i16_undef:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v4i16_undef:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <4 x i16> %x, %mask
%notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
@@ -664,13 +692,17 @@ define <2 x i32> @out_v2i32(<2 x i32> %x
; CHECK-SSE2-LABEL: out_v2i32:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v2i32:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <2 x i32> %x, %mask
%notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
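
Every out_v* test in this file shares the masked-merge pattern visible
in the IR above; in scalar form:

define i32 @masked_merge_sketch(i32 %x, i32 %y, i32 %mask) {
  %mx = and i32 %x, %mask        ; x where mask bits are set
  %notmask = xor i32 %mask, -1
  %my = and i32 %y, %notmask     ; y where mask bits are clear
  %r = or i32 %mx, %my
  ret i32 %r
}

Once the narrow mask is promoted, the xor with -1 only inverts the low
bits of each wide lane, so it is no longer a full NOT and cannot be
matched to andnps or vpcmov; that is why the revert splits these back
into explicit and/xor/and/or sequences.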
Modified: llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll Tue Aug 6 16:00:43 2019
@@ -542,37 +542,98 @@ define <16 x i1> @v16i1(<16 x i1> %x, <1
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: psllq $32, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psllq $32, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: psllq $32, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: psllq $32, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: psubd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: psubq %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: psrlq $32, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
-; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: psubq %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlq $32, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX512-NEXT: retq
%z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
}
Modified: llvm/trunk/test/CodeGen/X86/vec_cast2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_cast2.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_cast2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_cast2.ll Tue Aug 6 16:00:43 2019
@@ -5,10 +5,13 @@
define <8 x float> @cvt_v8i8_v8f32(<8 x i8> %src) {
; CHECK-LABEL: cvt_v8i8_v8f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
+; CHECK-NEXT: vpslld $24, %xmm1, %xmm1
+; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retl
;
@@ -49,7 +52,8 @@ define <8 x float> @cvt_v8i16_v8f32(<8 x
define <4 x float> @cvt_v4i8_v4f32(<4 x i8> %src) {
; CHECK-LABEL: cvt_v4i8_v4f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
+; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -65,7 +69,8 @@ define <4 x float> @cvt_v4i8_v4f32(<4 x
define <4 x float> @cvt_v4i16_v4f32(<4 x i16> %src) {
; CHECK-LABEL: cvt_v4i16_v4f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
+; CHECK-NEXT: vpslld $16, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -81,10 +86,11 @@ define <4 x float> @cvt_v4i16_v4f32(<4 x
define <8 x float> @cvt_v8u8_v8f32(<8 x i8> %src) {
; CHECK-LABEL: cvt_v8u8_v8f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vpand LCPI4_0, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retl
;
@@ -125,7 +131,7 @@ define <8 x float> @cvt_v8u16_v8f32(<8 x
define <4 x float> @cvt_v4u8_v4f32(<4 x i8> %src) {
; CHECK-LABEL: cvt_v4u8_v4f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vandps LCPI6_0, %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -141,7 +147,8 @@ define <4 x float> @cvt_v4u8_v4f32(<4 x
define <4 x float> @cvt_v4u16_v4f32(<4 x i16> %src) {
; CHECK-LABEL: cvt_v4u16_v4f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -160,7 +167,6 @@ define <8 x i8> @cvt_v8f32_v8i8(<8 x flo
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
;
@@ -200,7 +206,6 @@ define <4 x i8> @cvt_v4f32_v4i8(<4 x flo
; CHECK-LABEL: cvt_v4f32_v4i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4i8:
@@ -216,7 +221,6 @@ define <4 x i16> @cvt_v4f32_v4i16(<4 x f
; CHECK-LABEL: cvt_v4f32_v4i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4i16:
@@ -233,8 +237,7 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x flo
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
;
@@ -274,7 +277,6 @@ define <4 x i8> @cvt_v4f32_v4u8(<4 x flo
; CHECK-LABEL: cvt_v4f32_v4u8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4u8:
@@ -290,7 +292,6 @@ define <4 x i16> @cvt_v4f32_v4u16(<4 x f
; CHECK-LABEL: cvt_v4f32_v4u16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4u16:
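
A pattern that recurs through vec_cast2.ll above and vec_cast3.ll below:
with the narrow source promoted into wide lanes, sign extension is
rebuilt as a shift pair instead of a vpmovsx. Scalar sketch of the
cvt_v4i16_v4f32 form (illustrative name):

define float @sext_via_shifts_sketch(i32 %lane) {
  %hi = shl i32 %lane, 16     ; move the i16 payload to the top
  %sx = ashr i32 %hi, 16      ; shift back arithmetically == sext i16
  %f = sitofp i32 %sx to float
  ret float %f
}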
Modified: llvm/trunk/test/CodeGen/X86/vec_cast3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_cast3.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_cast3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_cast3.ll Tue Aug 6 16:00:43 2019
@@ -5,7 +5,9 @@
define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) {
; CHECK-LABEL: cvt_v2i8_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
+; CHECK-NEXT: vpsllq $56, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -21,7 +23,9 @@ define <2 x float> @cvt_v2i8_v2f32(<2 x
define <2 x float> @cvt_v2i16_v2f32(<2 x i16> %src) {
; CHECK-LABEL: cvt_v2i16_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
+; CHECK-NEXT: vpsllq $48, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -37,6 +41,7 @@ define <2 x float> @cvt_v2i16_v2f32(<2 x
define <2 x float> @cvt_v2i32_v2f32(<2 x i32> %src) {
; CHECK-LABEL: cvt_v2i32_v2f32:
; CHECK: ## %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -51,7 +56,7 @@ define <2 x float> @cvt_v2i32_v2f32(<2 x
define <2 x float> @cvt_v2u8_v2f32(<2 x i8> %src) {
; CHECK-LABEL: cvt_v2u8_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -67,7 +72,7 @@ define <2 x float> @cvt_v2u8_v2f32(<2 x
define <2 x float> @cvt_v2u16_v2f32(<2 x i16> %src) {
; CHECK-LABEL: cvt_v2u16_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,xmm0[10,11],zero,zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
@@ -83,9 +88,10 @@ define <2 x float> @cvt_v2u16_v2f32(<2 x
define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
; CHECK-LABEL: cvt_v2u32_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
; CHECK-NEXT: retl
@@ -106,7 +112,7 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x flo
; CHECK-LABEL: cvt_v2f32_v2i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2i8:
@@ -122,7 +128,7 @@ define <2 x i16> @cvt_v2f32_v2i16(<2 x f
; CHECK-LABEL: cvt_v2f32_v2i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2i16:
@@ -138,6 +144,7 @@ define <2 x i32> @cvt_v2f32_v2i32(<2 x f
; CHECK-LABEL: cvt_v2f32_v2i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2i32:
@@ -152,7 +159,7 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x flo
; CHECK-LABEL: cvt_v2f32_v2u8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2u8:
@@ -168,7 +175,7 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x f
; CHECK-LABEL: cvt_v2f32_v2u16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2u16:
@@ -183,13 +190,39 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x f
define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) {
; CHECK-LABEL: cvt_v2f32_v2u32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; CHECK-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm1
-; CHECK-NEXT: vcvttps2dq %xmm1, %xmm1
-; CHECK-NEXT: vxorps LCPI11_1, %xmm1, %xmm1
-; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: subl $36, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vucomiss %xmm1, %xmm2
+; CHECK-NEXT: jb LBB11_2
+; CHECK-NEXT: ## %bb.1:
+; CHECK-NEXT: vsubss %xmm1, %xmm2, %xmm2
+; CHECK-NEXT: LBB11_2:
+; CHECK-NEXT: vmovss %xmm2, (%esp)
+; CHECK-NEXT: flds (%esp)
+; CHECK-NEXT: fisttpll (%esp)
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: shll $31, %eax
+; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vucomiss %xmm1, %xmm0
+; CHECK-NEXT: jb LBB11_4
+; CHECK-NEXT: ## %bb.3:
+; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: LBB11_4:
+; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fisttpll {{[0-9]+}}(%esp)
+; CHECK-NEXT: setae %cl
+; CHECK-NEXT: movzbl %cl, %ecx
+; CHECK-NEXT: shll $31, %ecx
+; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: addl $36, %esp
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2u32:
Modified: llvm/trunk/test/CodeGen/X86/vec_ctbits.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_ctbits.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_ctbits.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_ctbits.ll Tue Aug 6 16:00:43 2019
@@ -110,8 +110,9 @@ declare <2 x i32> @llvm.ctpop.v2i32(<2 x
define <2 x i32> @promtz(<2 x i32> %a) nounwind {
; CHECK-LABEL: promtz:
; CHECK: # %bb.0:
+; CHECK-NEXT: por {{.*}}(%rip), %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: paddq %xmm0, %xmm1
; CHECK-NEXT: pandn %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $1, %xmm1
@@ -128,12 +129,7 @@ define <2 x i32> @promtz(<2 x i32> %a) n
; CHECK-NEXT: paddb %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT: psadbw %xmm0, %xmm2
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: psadbw %xmm0, %xmm1
-; CHECK-NEXT: packuswb %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
@@ -143,44 +139,44 @@ define <2 x i32> @promtz(<2 x i32> %a) n
define <2 x i32> @promlz(<2 x i32> %a) nounwind {
; CHECK-LABEL: promlz:
; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrld $1, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrld $2, %xmm0
-; CHECK-NEXT: por %xmm1, %xmm0
-; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrld $4, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrld $8, %xmm0
-; CHECK-NEXT: por %xmm1, %xmm0
-; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrld $16, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psrlq $1, %xmm2
+; CHECK-NEXT: por %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: psrlq $2, %xmm0
+; CHECK-NEXT: por %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psrlq $4, %xmm2
+; CHECK-NEXT: por %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: psrlq $8, %xmm0
+; CHECK-NEXT: por %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psrlq $16, %xmm2
+; CHECK-NEXT: por %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: por %xmm2, %xmm0
; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT: pxor %xmm1, %xmm2
+; CHECK-NEXT: pxor %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psubb %xmm0, %xmm2
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT: movdqa %xmm2, %xmm1
-; CHECK-NEXT: pand %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm2, %xmm3
+; CHECK-NEXT: pand %xmm0, %xmm3
; CHECK-NEXT: psrlw $2, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: paddb %xmm1, %xmm2
+; CHECK-NEXT: paddb %xmm3, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlw $4, %xmm0
; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-NEXT: psadbw %xmm1, %xmm2
-; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: psadbw %xmm1, %xmm0
-; CHECK-NEXT: packuswb %xmm2, %xmm0
+; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
ret <2 x i32> %c
@@ -190,27 +186,23 @@ define <2 x i32> @promlz(<2 x i32> %a) n
define <2 x i32> @prompop(<2 x i32> %a) nounwind {
; CHECK-LABEL: prompop:
; CHECK: # %bb.0:
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: psubb %xmm1, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: pand %xmm1, %xmm3
; CHECK-NEXT: psrlw $2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: paddb %xmm2, %xmm0
+; CHECK-NEXT: paddb %xmm3, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $4, %xmm1
; CHECK-NEXT: paddb %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT: psadbw %xmm0, %xmm2
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: psadbw %xmm0, %xmm1
-; CHECK-NEXT: packuswb %xmm2, %xmm1
+; CHECK-NEXT: psadbw %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
Modified: llvm/trunk/test/CodeGen/X86/vec_extract-mmx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_extract-mmx.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_extract-mmx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_extract-mmx.ll Tue Aug 6 16:00:43 2019
@@ -115,10 +115,12 @@ define i32 @test4(x86_mmx %a) nounwind {
; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-16, %esp
-; X32-NEXT: subl $32, %esp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $8, %esp
; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
@@ -126,7 +128,9 @@ define i32 @test4(x86_mmx %a) nounwind {
; X64-LABEL: test4:
; X64: # %bb.0:
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: retq
%tmp0 = bitcast x86_mmx %a to <2 x i32>
%tmp1 = extractelement <2 x i32> %tmp0, i32 1
Modified: llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll Tue Aug 6 16:00:43 2019
@@ -93,11 +93,13 @@ define <2 x i32> @fptosi_2f64_to_2i32(<2
; SSE-LABEL: fptosi_2f64_to_2i32:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32:
; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i32>
ret <2 x i32> %cvt
@@ -334,45 +336,53 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2
define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_4i32:
; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: subsd %xmm2, %xmm1
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: cvttsd2si %xmm0, %rdx
+; SSE-NEXT: ucomisd %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rdx
+; SSE-NEXT: movq %rdx, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movapd %xmm0, %xmm3
+; SSE-NEXT: subsd %xmm2, %xmm3
+; SSE-NEXT: cvttsd2si %xmm3, %rax
+; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT: ucomisd %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rcx
+; SSE-NEXT: movq %rcx, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: fptoui_2f64_to_4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3
-; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_2f64_to_4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
-; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; VEX-LABEL: fptoui_2f64_to_4i32:
+; VEX: # %bb.0:
+; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
+; VEX-NEXT: vcvttsd2si %xmm2, %rax
+; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; VEX-NEXT: xorq %rcx, %rax
+; VEX-NEXT: vcvttsd2si %xmm0, %rdx
+; VEX-NEXT: vucomisd %xmm1, %xmm0
+; VEX-NEXT: cmovaeq %rax, %rdx
+; VEX-NEXT: vmovq %rdx, %xmm2
+; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3
+; VEX-NEXT: vcvttsd2si %xmm3, %rax
+; VEX-NEXT: xorq %rcx, %rax
+; VEX-NEXT: vcvttsd2si %xmm0, %rcx
+; VEX-NEXT: vucomisd %xmm1, %xmm0
+; VEX-NEXT: cmovaeq %rax, %rcx
+; VEX-NEXT: vmovq %rcx, %xmm0
+; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_4i32:
; AVX512F: # %bb.0:
@@ -407,45 +417,51 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2
define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i32:
; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movapd %xmm0, %xmm2
+; SSE-NEXT: subsd %xmm1, %xmm2
+; SSE-NEXT: cvttsd2si %xmm2, %rax
+; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: cvttsd2si %xmm0, %rdx
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rdx
+; SSE-NEXT: movq %rdx, %xmm2
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movapd %xmm0, %xmm3
+; SSE-NEXT: subsd %xmm1, %xmm3
+; SSE-NEXT: cvttsd2si %xmm3, %rax
+; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: cvttsd2si %xmm0, %rcx
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rcx
+; SSE-NEXT: movq %rcx, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: retq
;
-; AVX1-LABEL: fptoui_2f64_to_2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3
-; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_2f64_to_2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; VEX-LABEL: fptoui_2f64_to_2i32:
+; VEX: # %bb.0:
+; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
+; VEX-NEXT: vcvttsd2si %xmm2, %rax
+; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; VEX-NEXT: xorq %rcx, %rax
+; VEX-NEXT: vcvttsd2si %xmm0, %rdx
+; VEX-NEXT: vucomisd %xmm1, %xmm0
+; VEX-NEXT: cmovaeq %rax, %rdx
+; VEX-NEXT: vmovq %rdx, %xmm2
+; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3
+; VEX-NEXT: vcvttsd2si %xmm3, %rax
+; VEX-NEXT: xorq %rcx, %rax
+; VEX-NEXT: vcvttsd2si %xmm0, %rcx
+; VEX-NEXT: vucomisd %xmm1, %xmm0
+; VEX-NEXT: cmovaeq %rax, %rcx
+; VEX-NEXT: vmovq %rcx, %xmm0
+; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_2i32:
; AVX512F: # %bb.0:
@@ -480,13 +496,29 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2
define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_2i32:
; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: subsd %xmm2, %xmm1
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: cvttsd2si %xmm0, %rdx
+; SSE-NEXT: ucomisd %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rdx
+; SSE-NEXT: movq %rdx, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movapd %xmm0, %xmm3
+; SSE-NEXT: subsd %xmm2, %xmm3
+; SSE-NEXT: cvttsd2si %xmm3, %rax
+; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: cvttsd2si %xmm0, %rcx
+; SSE-NEXT: ucomisd %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rcx
+; SSE-NEXT: movq %rcx, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_4f64_to_2i32:
@@ -732,20 +764,46 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4
define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i32:
; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movapd %xmm1, %xmm3
+; SSE-NEXT: subsd %xmm2, %xmm3
+; SSE-NEXT: cvttsd2si %xmm3, %rcx
+; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: cvttsd2si %xmm1, %rdx
+; SSE-NEXT: ucomisd %xmm2, %xmm1
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movq %rdx, %xmm3
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: movapd %xmm1, %xmm4
+; SSE-NEXT: subsd %xmm2, %xmm4
+; SSE-NEXT: cvttsd2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: cvttsd2si %xmm1, %rdx
+; SSE-NEXT: ucomisd %xmm2, %xmm1
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movq %rdx, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: subsd %xmm2, %xmm1
+; SSE-NEXT: cvttsd2si %xmm1, %rcx
+; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: cvttsd2si %xmm0, %rdx
+; SSE-NEXT: ucomisd %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movq %rdx, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movapd %xmm0, %xmm4
+; SSE-NEXT: subsd %xmm2, %xmm4
+; SSE-NEXT: cvttsd2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: ucomisd %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rcx, %rax
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_4f64_to_4i32:
@@ -816,11 +874,13 @@ define <2 x i32> @fptosi_2f32_to_2i32(<2
; SSE-LABEL: fptosi_2f32_to_2i32:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f32_to_2i32:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i32>
ret <2 x i32> %cvt
@@ -1199,66 +1259,77 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8
define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
; SSE-LABEL: fptoui_2f32_to_2i32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: cmpltps %xmm2, %xmm1
-; SSE-NEXT: cvttps2dq %xmm0, %xmm3
-; SSE-NEXT: subps %xmm2, %xmm0
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: andnps %xmm0, %xmm1
-; SSE-NEXT: orps %xmm3, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: subss %xmm2, %xmm1
+; SSE-NEXT: cvttss2si %xmm1, %rax
+; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: cvttss2si %xmm0, %rdx
+; SSE-NEXT: ucomiss %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rdx
+; SSE-NEXT: movq %rdx, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: subss %xmm2, %xmm3
+; SSE-NEXT: cvttss2si %xmm3, %rax
+; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: cvttss2si %xmm0, %rcx
+; SSE-NEXT: ucomiss %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rcx
+; SSE-NEXT: movq %rcx, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: fptoui_2f32_to_2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_2f32_to_2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; VEX-LABEL: fptoui_2f32_to_2i32:
+; VEX: # %bb.0:
+; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
+; VEX-NEXT: vcvttss2si %xmm2, %rax
+; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; VEX-NEXT: xorq %rcx, %rax
+; VEX-NEXT: vcvttss2si %xmm0, %rdx
+; VEX-NEXT: vucomiss %xmm1, %xmm0
+; VEX-NEXT: cmovaeq %rax, %rdx
+; VEX-NEXT: vmovq %rdx, %xmm2
+; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3
+; VEX-NEXT: vcvttss2si %xmm3, %rax
+; VEX-NEXT: xorq %rcx, %rax
+; VEX-NEXT: vcvttss2si %xmm0, %rcx
+; VEX-NEXT: vucomiss %xmm1, %xmm0
+; VEX-NEXT: cmovaeq %rax, %rcx
+; VEX-NEXT: vmovq %rcx, %xmm0
+; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f32_to_2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f32_to_2i32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i32>
ret <2 x i32> %cvt
@@ -2154,8 +2225,7 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2
; SSE-LABEL: fptosi_2f16_to_4i32:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rax
-; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT: callq __gnu_f2h_ieee
; SSE-NEXT: movzwl %ax, %edi
; SSE-NEXT: callq __gnu_h2f_ieee
@@ -2165,20 +2235,20 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2
; SSE-NEXT: callq __gnu_f2h_ieee
; SSE-NEXT: movzwl %ax, %edi
; SSE-NEXT: callq __gnu_h2f_ieee
-; SSE-NEXT: cvttss2si %xmm0, %eax
-; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload
-; SSE-NEXT: movd %ecx, %xmm0
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movq %rax, %xmm1
+; SSE-NEXT: cvttss2si (%rsp), %rax # 4-byte Folded Reload
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE-NEXT: popq %rax
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f16_to_4i32:
; VEX: # %bb.0:
; VEX-NEXT: pushq %rax
-; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; VEX-NEXT: vmovaps %xmm1, %xmm0
+; VEX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; VEX-NEXT: callq __gnu_f2h_ieee
; VEX-NEXT: movzwl %ax, %edi
; VEX-NEXT: callq __gnu_h2f_ieee
@@ -2188,27 +2258,27 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2
; VEX-NEXT: callq __gnu_f2h_ieee
; VEX-NEXT: movzwl %ax, %edi
; VEX-NEXT: callq __gnu_h2f_ieee
-; VEX-NEXT: vcvttss2si %xmm0, %eax
-; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload
-; VEX-NEXT: vmovd %ecx, %xmm0
-; VEX-NEXT: vmovd %eax, %xmm1
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; VEX-NEXT: vcvttss2si %xmm0, %rax
+; VEX-NEXT: vmovq %rax, %xmm0
+; VEX-NEXT: vcvttss2si (%rsp), %rax # 4-byte Folded Reload
+; VEX-NEXT: vmovq %rax, %xmm1
+; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; VEX-NEXT: popq %rax
; VEX-NEXT: retq
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vcvttss2si %xmm0, %eax
-; AVX512-NEXT: vcvttss2si %xmm1, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm1, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm0, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX512-NEXT: retq
%cvt = fptosi <2 x half> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2225,31 +2295,32 @@ define <4 x i32> @fptosi_2f80_to_4i32(<2
; SSE-NEXT: orl $3072, %eax # imm = 0xC00
; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp)
+; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: orl $3072, %eax # imm = 0xC00
; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp)
+; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f80_to_4i32:
; AVX: # %bb.0:
; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
-; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp)
-; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp)
+; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT: retq
%cvt = fptosi <2 x x86_fp80> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2259,44 +2330,51 @@ define <4 x i32> @fptosi_2f80_to_4i32(<2
define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
; SSE-LABEL: fptosi_2f128_to_4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq %rcx, %r14
-; SSE-NEXT: movq %rdx, %rbx
-; SSE-NEXT: callq __fixtfsi
-; SSE-NEXT: movl %eax, %ebp
+; SSE-NEXT: subq $24, %rsp
+; SSE-NEXT: movq %rsi, %r14
+; SSE-NEXT: movq %rdi, %rbx
+; SSE-NEXT: movq %rdx, %rdi
+; SSE-NEXT: movq %rcx, %rsi
+; SSE-NEXT: callq __fixtfdi
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: movq %rbx, %rdi
; SSE-NEXT: movq %r14, %rsi
-; SSE-NEXT: callq __fixtfsi
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd %ebp, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
+; SSE-NEXT: callq __fixtfdi
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE-NEXT: addq $24, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f128_to_4i32:
; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %r14
; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq %rcx, %r14
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: callq __fixtfsi
-; AVX-NEXT: movl %eax, %ebp
+; AVX-NEXT: subq $24, %rsp
+; AVX-NEXT: movq %rsi, %r14
+; AVX-NEXT: movq %rdi, %rbx
+; AVX-NEXT: movq %rdx, %rdi
+; AVX-NEXT: movq %rcx, %rsi
+; AVX-NEXT: callq __fixtfdi
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: movq %rbx, %rdi
; AVX-NEXT: movq %r14, %rsi
-; AVX-NEXT: callq __fixtfsi
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vmovd %ebp, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT: callq __fixtfdi
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r14
-; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
%cvt = fptosi <2 x fp128> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2307,16 +2385,41 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x
; SSE-LABEL: fptosi_2f32_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
-; AVX-LABEL: fptosi_2f32_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; VEX-LABEL: fptosi_2f32_to_2i8:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT: vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_2f32_to_2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptosi_2f32_to_2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_2f32_to_2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f32_to_2i8:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i8>
ret <2 x i8> %cvt
}
@@ -2325,14 +2428,41 @@ define <2 x i16> @fptosi_2f32_to_2i16(<2
; SSE-LABEL: fptosi_2f32_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
-; AVX-LABEL: fptosi_2f32_to_2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: retq
+; VEX-LABEL: fptosi_2f32_to_2i16:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT: vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_2f32_to_2i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptosi_2f32_to_2i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_2f32_to_2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i16>
ret <2 x i16> %cvt
}
@@ -2341,16 +2471,40 @@ define <2 x i8> @fptoui_2f32_to_2i8(<2 x
; SSE-LABEL: fptoui_2f32_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
-; AVX-LABEL: fptoui_2f32_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; VEX-LABEL: fptoui_2f32_to_2i8:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptoui_2f32_to_2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i8:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i8>
ret <2 x i8> %cvt
}
@@ -2359,14 +2513,40 @@ define <2 x i16> @fptoui_2f32_to_2i16(<2
; SSE-LABEL: fptoui_2f32_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
-; AVX-LABEL: fptoui_2f32_to_2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: retq
+; VEX-LABEL: fptoui_2f32_to_2i16:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptoui_2f32_to_2i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i16>
ret <2 x i16> %cvt
}
@@ -2375,16 +2555,41 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x
; SSE-LABEL: fptosi_2f64_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
-; AVX-LABEL: fptosi_2f64_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; VEX-LABEL: fptosi_2f64_to_2i8:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT: vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_2f64_to_2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptosi_2f64_to_2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_2f64_to_2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f64_to_2i8:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i8>
ret <2 x i8> %cvt
}
@@ -2393,14 +2598,41 @@ define <2 x i16> @fptosi_2f64_to_2i16(<2
; SSE-LABEL: fptosi_2f64_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
-; AVX-LABEL: fptosi_2f64_to_2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: retq
+; VEX-LABEL: fptosi_2f64_to_2i16:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT: vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_2f64_to_2i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptosi_2f64_to_2i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_2f64_to_2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i16>
ret <2 x i16> %cvt
}
@@ -2409,16 +2641,40 @@ define <2 x i8> @fptoui_2f64_to_2i8(<2 x
; SSE-LABEL: fptoui_2f64_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
-; AVX-LABEL: fptoui_2f64_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; VEX-LABEL: fptoui_2f64_to_2i8:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_2i8:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i8>
ret <2 x i8> %cvt
}
@@ -2427,14 +2683,40 @@ define <2 x i16> @fptoui_2f64_to_2i16(<2
; SSE-LABEL: fptoui_2f64_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
-; AVX-LABEL: fptoui_2f64_to_2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: retq
+; VEX-LABEL: fptoui_2f64_to_2i16:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_2i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_2i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i16>
ret <2 x i16> %cvt
}
@@ -2613,10 +2895,10 @@ define <16 x i8> @fptoui_16f32_to_16i8(<
; AVX1: # %bb.0:
; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -2625,10 +2907,10 @@ define <16 x i8> @fptoui_16f32_to_16i8(<
; AVX2: # %bb.0:
; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vec_insert-5.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_insert-5.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_insert-5.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_insert-5.ll Tue Aug 6 16:00:43 2019
@@ -19,7 +19,8 @@ define void @t1(i32 %a, x86_mmx* %P) no
; X64: # %bb.0:
; X64-NEXT: shll $12, %edi
; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
%tmp12 = shl i32 %a, 12
Modified: llvm/trunk/test/CodeGen/X86/vec_insert-7.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_insert-7.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_insert-7.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_insert-7.ll Tue Aug 6 16:00:43 2019
@@ -8,15 +8,11 @@
define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
; X32-LABEL: mmx_movzl:
; X32: ## %bb.0:
-; X32-NEXT: subl $28, %esp
-; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movdqa (%esp), %xmm0
+; X32-NEXT: subl $12, %esp
; X32-NEXT: movl $32, %eax
-; X32-NEXT: pinsrd $0, %eax, %xmm0
-; X32-NEXT: pxor %xmm1, %xmm1
-; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; X32-NEXT: movdq2q %xmm1, %mm0
-; X32-NEXT: addl $28, %esp
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movdq2q %xmm0, %mm0
+; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; X64-LABEL: mmx_movzl:
Modified: llvm/trunk/test/CodeGen/X86/vec_insert-mmx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_insert-mmx.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_insert-mmx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_insert-mmx.ll Tue Aug 6 16:00:43 2019
@@ -61,7 +61,13 @@ define void @t3() {
; X32-NEXT: movl L_g0$non_lazy_ptr, %eax
; X32-NEXT: movl L_g1$non_lazy_ptr, %ecx
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pinsrw $0, (%eax), %xmm0
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-NEXT: movzwl (%eax), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT: movq %xmm0, (%ecx)
; X32-NEXT: retl
;
@@ -69,8 +75,10 @@ define void @t3() {
; X64: ## %bb.0:
; X64-NEXT: movq _g0@{{.*}}(%rip), %rax
; X64-NEXT: movq _g1@{{.*}}(%rip), %rcx
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pinsrw $0, (%rax), %xmm0
+; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-NEXT: movzwl (%rax), %eax
+; X64-NEXT: pinsrd $0, %eax, %xmm0
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
load i16, i16* @g0
Modified: llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll Tue Aug 6 16:00:43 2019
@@ -3164,15 +3164,15 @@ define <2 x double> @sitofp_load_2i16_to
;
; SSE41-LABEL: sitofp_load_2i16_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sitofp_load_2i16_to_2f64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <2 x i16>, <2 x i16> *%a
@@ -3193,17 +3193,15 @@ define <2 x double> @sitofp_load_2i8_to_
;
; SSE41-LABEL: sitofp_load_2i8_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movzwl (%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sitofp_load_2i8_to_2f64:
; AVX: # %bb.0:
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <2 x i8>, <2 x i8> *%a
@@ -3734,17 +3732,15 @@ define <2 x double> @uitofp_load_2i8_to_
;
; SSE41-LABEL: uitofp_load_2i8_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movzwl (%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: uitofp_load_2i8_to_2f64:
; AVX: # %bb.0:
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <2 x i8>, <2 x i8> *%a
@@ -5581,12 +5577,14 @@ define void @aggregate_sitofp_8i16_to_8f
; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq 24(%rdi), %rax
-; SSE41-NEXT: pmovsxwd 16(%rdi), %xmm0
-; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
+; SSE41-NEXT: movdqu 8(%rdi), %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, 16(%rax)
-; SSE41-NEXT: movaps %xmm1, (%rax)
+; SSE41-NEXT: movaps %xmm0, (%rax)
+; SSE41-NEXT: movaps %xmm1, 16(%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
Modified: llvm/trunk/test/CodeGen/X86/vec_saddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_saddo.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_saddo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_saddo.ll Tue Aug 6 16:00:43 2019
@@ -49,73 +49,134 @@ define <1 x i32> @saddo_v1i32(<1 x i32>
}
define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
-; SSE-LABEL: saddo_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: movq %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: saddo_v2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllq $32, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: saddo_v2i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: psllq $32, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: paddq %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: saddo_v2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: paddq %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllq $32, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT: movq %xmm1, (%rdi)
+; SSE41-NEXT: retq
;
; AVX1-LABEL: saddo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: saddo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: saddo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -1138,7 +1199,6 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: saddo_v2i64:
@@ -1185,7 +1245,6 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pandn %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: saddo_v2i64:
@@ -1221,14 +1280,13 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm2
-; SSE41-NEXT: pandn %xmm4, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm0
+; SSE41-NEXT: pandn %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: saddo_v2i64:
@@ -1245,7 +1303,6 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
@@ -1263,7 +1320,6 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
@@ -1273,13 +1329,13 @@ define <2 x i32> @saddo_v2i64(<2 x i64>
; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k2
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
; AVX512-NEXT: kxorw %k2, %k1, %k1
; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
@@ -1566,44 +1622,44 @@ define <2 x i32> @saddo_v2i128(<2 x i128
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: testq %r9, %r9
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: testq %rsi, %rsi
-; SSE2-NEXT: setns %bl
-; SSE2-NEXT: cmpb %al, %bl
-; SSE2-NEXT: sete %bpl
-; SSE2-NEXT: addq %r8, %rdi
-; SSE2-NEXT: adcq %r9, %rsi
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: cmpb %al, %bl
-; SSE2-NEXT: setne %al
-; SSE2-NEXT: andb %bpl, %al
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq %rcx, %rbp
-; SSE2-NEXT: adcq %r10, %rbp
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: adcq %r11, %rax
; SSE2-NEXT: setns %bl
; SSE2-NEXT: testq %rcx, %rcx
; SSE2-NEXT: setns %cl
; SSE2-NEXT: cmpb %bl, %cl
-; SSE2-NEXT: setne %r8b
-; SSE2-NEXT: testq %r10, %r10
+; SSE2-NEXT: setne %bpl
+; SSE2-NEXT: testq %r11, %r11
; SSE2-NEXT: setns %bl
; SSE2-NEXT: cmpb %bl, %cl
; SSE2-NEXT: sete %cl
-; SSE2-NEXT: andb %r8b, %cl
+; SSE2-NEXT: andb %bpl, %cl
+; SSE2-NEXT: movzbl %cl, %ebp
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: testq %rsi, %rsi
+; SSE2-NEXT: setns %cl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: sete %r11b
+; SSE2-NEXT: addq %r8, %rdi
+; SSE2-NEXT: adcq %r9, %rsi
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: setne %cl
+; SSE2-NEXT: andb %r11b, %cl
; SSE2-NEXT: movzbl %cl, %ecx
-; SSE2-NEXT: negl %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: movzbl %al, %eax
-; SSE2-NEXT: negl %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rdx, 16(%r11)
-; SSE2-NEXT: movq %rdi, (%r11)
-; SSE2-NEXT: movq %rbp, 24(%r11)
-; SSE2-NEXT: movq %rsi, 8(%r11)
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: pinsrw $4, %ebp, %xmm0
+; SSE2-NEXT: movq %rdx, 16(%r10)
+; SSE2-NEXT: movq %rdi, (%r10)
+; SSE2-NEXT: movq %rax, 24(%r10)
+; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
@@ -1612,44 +1668,44 @@ define <2 x i32> @saddo_v2i128(<2 x i128
; SSSE3: # %bb.0:
; SSSE3-NEXT: pushq %rbp
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSSE3-NEXT: testq %r9, %r9
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: testq %rsi, %rsi
-; SSSE3-NEXT: setns %bl
-; SSSE3-NEXT: cmpb %al, %bl
-; SSSE3-NEXT: sete %bpl
-; SSSE3-NEXT: addq %r8, %rdi
-; SSSE3-NEXT: adcq %r9, %rsi
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: cmpb %al, %bl
-; SSSE3-NEXT: setne %al
-; SSSE3-NEXT: andb %bpl, %al
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; SSSE3-NEXT: movq %rcx, %rbp
-; SSSE3-NEXT: adcq %r10, %rbp
+; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: adcq %r11, %rax
; SSSE3-NEXT: setns %bl
; SSSE3-NEXT: testq %rcx, %rcx
; SSSE3-NEXT: setns %cl
; SSSE3-NEXT: cmpb %bl, %cl
-; SSSE3-NEXT: setne %r8b
-; SSSE3-NEXT: testq %r10, %r10
+; SSSE3-NEXT: setne %bpl
+; SSSE3-NEXT: testq %r11, %r11
; SSSE3-NEXT: setns %bl
; SSSE3-NEXT: cmpb %bl, %cl
; SSSE3-NEXT: sete %cl
-; SSSE3-NEXT: andb %r8b, %cl
+; SSSE3-NEXT: andb %bpl, %cl
+; SSSE3-NEXT: movzbl %cl, %ebp
+; SSSE3-NEXT: testq %r9, %r9
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: testq %rsi, %rsi
+; SSSE3-NEXT: setns %cl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: sete %r11b
+; SSSE3-NEXT: addq %r8, %rdi
+; SSSE3-NEXT: adcq %r9, %rsi
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: setne %cl
+; SSSE3-NEXT: andb %r11b, %cl
; SSSE3-NEXT: movzbl %cl, %ecx
-; SSSE3-NEXT: negl %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: movzbl %al, %eax
-; SSSE3-NEXT: negl %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rdx, 16(%r11)
-; SSSE3-NEXT: movq %rdi, (%r11)
-; SSSE3-NEXT: movq %rbp, 24(%r11)
-; SSSE3-NEXT: movq %rsi, 8(%r11)
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0
+; SSSE3-NEXT: movq %rdx, 16(%r10)
+; SSSE3-NEXT: movq %rdi, (%r10)
+; SSSE3-NEXT: movq %rax, 24(%r10)
+; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %rbp
; SSSE3-NEXT: retq
@@ -1658,43 +1714,44 @@ define <2 x i32> @saddo_v2i128(<2 x i128
; SSE41: # %bb.0:
; SSE41-NEXT: pushq %rbp
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE41-NEXT: testq %r9, %r9
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: testq %rsi, %rsi
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: cmpb %al, %bl
-; SSE41-NEXT: sete %bpl
-; SSE41-NEXT: addq %r8, %rdi
-; SSE41-NEXT: adcq %r9, %rsi
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: cmpb %al, %bl
-; SSE41-NEXT: setne %al
-; SSE41-NEXT: andb %bpl, %al
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movq %rcx, %rbp
-; SSE41-NEXT: adcq %r10, %rbp
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: adcq %r11, %rax
; SSE41-NEXT: setns %bl
; SSE41-NEXT: testq %rcx, %rcx
; SSE41-NEXT: setns %cl
; SSE41-NEXT: cmpb %bl, %cl
-; SSE41-NEXT: setne %r8b
-; SSE41-NEXT: testq %r10, %r10
+; SSE41-NEXT: setne %bpl
+; SSE41-NEXT: testq %r11, %r11
; SSE41-NEXT: setns %bl
; SSE41-NEXT: cmpb %bl, %cl
; SSE41-NEXT: sete %cl
-; SSE41-NEXT: andb %r8b, %cl
+; SSE41-NEXT: andb %bpl, %cl
+; SSE41-NEXT: movzbl %cl, %ebp
+; SSE41-NEXT: testq %r9, %r9
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: testq %rsi, %rsi
+; SSE41-NEXT: setns %cl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: sete %r11b
+; SSE41-NEXT: addq %r8, %rdi
+; SSE41-NEXT: adcq %r9, %rsi
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: setne %cl
+; SSE41-NEXT: andb %r11b, %cl
; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: negl %ecx
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: negl %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; SSE41-NEXT: movq %rdx, 16(%r11)
-; SSE41-NEXT: movq %rdi, (%r11)
-; SSE41-NEXT: movq %rbp, 24(%r11)
-; SSE41-NEXT: movq %rsi, 8(%r11)
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%r10)
+; SSE41-NEXT: movq %rdi, (%r10)
+; SSE41-NEXT: movq %rax, 24(%r10)
+; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %rbp
; SSE41-NEXT: retq
@@ -1703,43 +1760,44 @@ define <2 x i32> @saddo_v2i128(<2 x i128
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX1-NEXT: testq %r9, %r9
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: testq %rsi, %rsi
-; AVX1-NEXT: setns %bl
-; AVX1-NEXT: cmpb %al, %bl
-; AVX1-NEXT: sete %bpl
-; AVX1-NEXT: addq %r8, %rdi
-; AVX1-NEXT: adcq %r9, %rsi
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: cmpb %al, %bl
-; AVX1-NEXT: setne %al
-; AVX1-NEXT: andb %bpl, %al
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; AVX1-NEXT: movq %rcx, %rbp
-; AVX1-NEXT: adcq %r10, %rbp
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: adcq %r11, %rax
; AVX1-NEXT: setns %bl
; AVX1-NEXT: testq %rcx, %rcx
; AVX1-NEXT: setns %cl
; AVX1-NEXT: cmpb %bl, %cl
-; AVX1-NEXT: setne %r8b
-; AVX1-NEXT: testq %r10, %r10
+; AVX1-NEXT: setne %bpl
+; AVX1-NEXT: testq %r11, %r11
; AVX1-NEXT: setns %bl
; AVX1-NEXT: cmpb %bl, %cl
; AVX1-NEXT: sete %cl
-; AVX1-NEXT: andb %r8b, %cl
+; AVX1-NEXT: andb %bpl, %cl
+; AVX1-NEXT: movzbl %cl, %ebp
+; AVX1-NEXT: testq %r9, %r9
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: testq %rsi, %rsi
+; AVX1-NEXT: setns %cl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: sete %r11b
+; AVX1-NEXT: addq %r8, %rdi
+; AVX1-NEXT: adcq %r9, %rsi
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: setne %cl
+; AVX1-NEXT: andb %r11b, %cl
; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: negl %ecx
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, 16(%r11)
-; AVX1-NEXT: movq %rdi, (%r11)
-; AVX1-NEXT: movq %rbp, 24(%r11)
-; AVX1-NEXT: movq %rsi, 8(%r11)
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, 16(%r10)
+; AVX1-NEXT: movq %rdi, (%r10)
+; AVX1-NEXT: movq %rax, 24(%r10)
+; AVX1-NEXT: movq %rsi, 8(%r10)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
@@ -1748,43 +1806,44 @@ define <2 x i32> @saddo_v2i128(<2 x i128
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: testq %r9, %r9
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: setns %bl
-; AVX2-NEXT: cmpb %al, %bl
-; AVX2-NEXT: sete %bpl
-; AVX2-NEXT: addq %r8, %rdi
-; AVX2-NEXT: adcq %r9, %rsi
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: cmpb %al, %bl
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: andb %bpl, %al
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq %rcx, %rbp
-; AVX2-NEXT: adcq %r10, %rbp
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: adcq %r11, %rax
; AVX2-NEXT: setns %bl
; AVX2-NEXT: testq %rcx, %rcx
; AVX2-NEXT: setns %cl
; AVX2-NEXT: cmpb %bl, %cl
-; AVX2-NEXT: setne %r8b
-; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: setne %bpl
+; AVX2-NEXT: testq %r11, %r11
; AVX2-NEXT: setns %bl
; AVX2-NEXT: cmpb %bl, %cl
; AVX2-NEXT: sete %cl
-; AVX2-NEXT: andb %r8b, %cl
+; AVX2-NEXT: andb %bpl, %cl
+; AVX2-NEXT: movzbl %cl, %ebp
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: setns %cl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: sete %r11b
+; AVX2-NEXT: addq %r8, %rdi
+; AVX2-NEXT: adcq %r9, %rsi
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: setne %cl
+; AVX2-NEXT: andb %r11b, %cl
; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: negl %ecx
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, 16(%r11)
-; AVX2-NEXT: movq %rdi, (%r11)
-; AVX2-NEXT: movq %rbp, 24(%r11)
-; AVX2-NEXT: movq %rsi, 8(%r11)
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, 16(%r10)
+; AVX2-NEXT: movq %rdi, (%r10)
+; AVX2-NEXT: movq %rax, 24(%r10)
+; AVX2-NEXT: movq %rsi, 8(%r10)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
@@ -1825,12 +1884,12 @@ define <2 x i32> @saddo_v2i128(<2 x i128
; AVX512-NEXT: andl $1, %ecx
; AVX512-NEXT: kmovw %ecx, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%r10)
; AVX512-NEXT: movq %rdi, (%r10)
; AVX512-NEXT: movq %r14, 24(%r10)
; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vec_smulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_smulo.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_smulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_smulo.ll Tue Aug 6 16:00:43 2019
@@ -51,123 +51,238 @@ define <1 x i32> @smulo_v1i32(<1 x i32>
define <2 x i32> @smulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE2-LABEL: smulo_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT: psubd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllq $32, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: movq %xmm1, %r8
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rcx
+; SSE2-NEXT: movq %xmm2, %rdx
+; SSE2-NEXT: movq %xmm1, %rsi
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: imulq %rdx, %rsi
+; SSE2-NEXT: movq $-1, %r9
+; SSE2-NEXT: movl $0, %edx
+; SSE2-NEXT: cmovoq %r9, %rdx
+; SSE2-NEXT: movq %rsi, %xmm1
+; SSE2-NEXT: imulq %r8, %rcx
+; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movq %rdx, %xmm0
+; SSE2-NEXT: cmovoq %r9, %rax
+; SSE2-NEXT: movq %rax, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movq %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: smulo_v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: paddd %xmm3, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSSE3-NEXT: psubd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psllq $32, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: movq %xmm1, %r8
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movq %xmm0, %rcx
+; SSSE3-NEXT: movq %xmm2, %rdx
+; SSSE3-NEXT: movq %xmm1, %rsi
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: imulq %rdx, %rsi
+; SSSE3-NEXT: movq $-1, %r9
+; SSSE3-NEXT: movl $0, %edx
+; SSSE3-NEXT: cmovoq %r9, %rdx
+; SSSE3-NEXT: movq %rsi, %xmm1
+; SSSE3-NEXT: imulq %r8, %rcx
+; SSSE3-NEXT: movq %rcx, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: movq %rdx, %xmm0
+; SSSE3-NEXT: cmovoq %r9, %rax
+; SSSE3-NEXT: movq %rax, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT: movq %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: smulo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pmuldq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movq %xmm2, %r8
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: movq %xmm1, %rcx
+; SSE41-NEXT: pextrq $1, %xmm2, %rdx
+; SSE41-NEXT: pextrq $1, %xmm1, %rsi
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: imulq %rdx, %rsi
+; SSE41-NEXT: movq $-1, %r9
+; SSE41-NEXT: movl $0, %edx
+; SSE41-NEXT: cmovoq %r9, %rdx
+; SSE41-NEXT: movq %rsi, %xmm0
+; SSE41-NEXT: imulq %r8, %rcx
+; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllq $32, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm2
+; SSE41-NEXT: movq %rdx, %xmm3
+; SSE41-NEXT: cmovoq %r9, %rax
+; SSE41-NEXT: movq %rax, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT: movq %xmm1, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: smulo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vmovq %xmm1, %r8
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: imulq %rdx, %rsi
+; AVX1-NEXT: movq $-1, %r9
+; AVX1-NEXT: movl $0, %edx
+; AVX1-NEXT: cmovoq %r9, %rdx
+; AVX1-NEXT: vmovq %rsi, %xmm0
+; AVX1-NEXT: imulq %r8, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: cmovoq %r9, %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: smulo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vmovq %xmm1, %r8
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: imulq %rdx, %rsi
+; AVX2-NEXT: movq $-1, %r9
+; AVX2-NEXT: movl $0, %edx
+; AVX2-NEXT: cmovoq %r9, %rdx
+; AVX2-NEXT: vmovq %rsi, %xmm0
+; AVX2-NEXT: imulq %r8, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: cmovoq %r9, %rax
+; AVX2-NEXT: vmovq %rax, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: smulo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
-; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
-; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %xmm1, %rax
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT: imulq %rdx, %rsi
+; AVX512-NEXT: seto %dl
+; AVX512-NEXT: vmovq %rsi, %xmm0
+; AVX512-NEXT: imulq %rax, %rcx
+; AVX512-NEXT: vmovq %rcx, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpneqq %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %edx, %k1
+; AVX512-NEXT: kshiftlw $1, %k1, %k1
+; AVX512-NEXT: seto %al
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: kmovw %eax, %k2
+; AVX512-NEXT: korw %k1, %k2, %k1
+; AVX512-NEXT: korw %k1, %k0, %k1
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.smul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -1622,15 +1737,14 @@ define <2 x i32> @smulo_v2i64(<2 x i64>
; SSE2-NEXT: movq $-1, %r9
; SSE2-NEXT: movl $0, %edx
; SSE2-NEXT: cmovoq %r9, %rdx
-; SSE2-NEXT: movq %rsi, %xmm1
-; SSE2-NEXT: imulq %r8, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movq %rdx, %xmm0
+; SSE2-NEXT: imulq %r8, %rcx
; SSE2-NEXT: cmovoq %r9, %rax
-; SSE2-NEXT: movq %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movq %rax, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movq %rsi, %xmm1
+; SSE2-NEXT: movq %rcx, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: movdqa %xmm1, (%rdi)
; SSE2-NEXT: retq
;
@@ -1647,15 +1761,14 @@ define <2 x i32> @smulo_v2i64(<2 x i64>
; SSSE3-NEXT: movq $-1, %r9
; SSSE3-NEXT: movl $0, %edx
; SSSE3-NEXT: cmovoq %r9, %rdx
-; SSSE3-NEXT: movq %rsi, %xmm1
-; SSSE3-NEXT: imulq %r8, %rcx
-; SSSE3-NEXT: movq %rcx, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movq %rdx, %xmm0
+; SSSE3-NEXT: imulq %r8, %rcx
; SSSE3-NEXT: cmovoq %r9, %rax
-; SSSE3-NEXT: movq %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: movq %rax, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movq %rsi, %xmm1
+; SSSE3-NEXT: movq %rcx, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: movdqa %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
@@ -1670,16 +1783,15 @@ define <2 x i32> @smulo_v2i64(<2 x i64>
; SSE41-NEXT: movq $-1, %r9
; SSE41-NEXT: movl $0, %edx
; SSE41-NEXT: cmovoq %r9, %rdx
-; SSE41-NEXT: movq %rsi, %xmm0
+; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: imulq %r8, %rcx
-; SSE41-NEXT: movq %rcx, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: cmovoq %r9, %rax
-; SSE41-NEXT: movq %rax, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE41-NEXT: movdqa %xmm1, (%rdi)
+; SSE41-NEXT: movq %rax, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: movq %rsi, %xmm1
+; SSE41-NEXT: movq %rcx, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE41-NEXT: movdqa %xmm2, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: smulo_v2i64:
@@ -1693,15 +1805,14 @@ define <2 x i32> @smulo_v2i64(<2 x i64>
; AVX1-NEXT: movq $-1, %r9
; AVX1-NEXT: movl $0, %edx
; AVX1-NEXT: cmovoq %r9, %rdx
-; AVX1-NEXT: vmovq %rsi, %xmm0
-; AVX1-NEXT: imulq %r8, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovq %rdx, %xmm0
+; AVX1-NEXT: imulq %r8, %rcx
; AVX1-NEXT: cmovoq %r9, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovq %rsi, %xmm1
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
@@ -1716,41 +1827,38 @@ define <2 x i32> @smulo_v2i64(<2 x i64>
; AVX2-NEXT: movq $-1, %r9
; AVX2-NEXT: movl $0, %edx
; AVX2-NEXT: cmovoq %r9, %rdx
-; AVX2-NEXT: vmovq %rsi, %xmm0
-; AVX2-NEXT: imulq %r8, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX2-NEXT: vmovq %rdx, %xmm0
+; AVX2-NEXT: imulq %r8, %rcx
; AVX2-NEXT: cmovoq %r9, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovq %rsi, %xmm1
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: smulo_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT: vmovq %xmm1, %rdx
-; AVX512-NEXT: vmovq %xmm0, %rsi
+; AVX512-NEXT: vmovq %xmm1, %rax
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512-NEXT: imulq %rdx, %rsi
; AVX512-NEXT: seto %dl
+; AVX512-NEXT: kmovd %edx, %k0
+; AVX512-NEXT: kshiftlw $1, %k0, %k0
; AVX512-NEXT: imulq %rax, %rcx
-; AVX512-NEXT: vmovq %rcx, %xmm0
-; AVX512-NEXT: vmovq %rsi, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512-NEXT: seto %al
-; AVX512-NEXT: kmovd %eax, %k0
-; AVX512-NEXT: kmovd %edx, %k1
-; AVX512-NEXT: kshiftrw $1, %k1, %k2
-; AVX512-NEXT: kxorw %k0, %k2, %k0
-; AVX512-NEXT: kshiftlw $15, %k0, %k0
-; AVX512-NEXT: kshiftrw $14, %k0, %k0
-; AVX512-NEXT: kxorw %k0, %k1, %k1
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: korw %k0, %k1, %k1
+; AVX512-NEXT: vmovq %rsi, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.smul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
@@ -2361,17 +2469,20 @@ define <2 x i32> @smulo_v2i128(<2 x i128
; SSE2-NEXT: movq %r12, %rcx
; SSE2-NEXT: callq __muloti4
; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: sbbl %esi, %esi
-; SSE2-NEXT: movd %esi, %xmm1
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: sbbl %ecx, %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: setne %cl
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: setne %sil
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-NEXT: movq %rdx, 24(%r15)
; SSE2-NEXT: movq %rax, 16(%r15)
; SSE2-NEXT: movq %rbp, 8(%r15)
; SSE2-NEXT: movq %r13, (%r15)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: addq $24, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
@@ -2410,17 +2521,20 @@ define <2 x i32> @smulo_v2i128(<2 x i128
; SSSE3-NEXT: movq %r12, %rcx
; SSSE3-NEXT: callq __muloti4
; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: sbbl %esi, %esi
-; SSSE3-NEXT: movd %esi, %xmm1
-; SSSE3-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: sbbl %ecx, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT: setne %cl
+; SSSE3-NEXT: xorl %esi, %esi
+; SSSE3-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT: setne %sil
+; SSSE3-NEXT: movd %esi, %xmm0
+; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
; SSSE3-NEXT: movq %rdx, 24(%r15)
; SSSE3-NEXT: movq %rax, 16(%r15)
; SSSE3-NEXT: movq %rbp, 8(%r15)
; SSSE3-NEXT: movq %r13, (%r15)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: addq $24, %rsp
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
@@ -2459,16 +2573,20 @@ define <2 x i32> @smulo_v2i128(<2 x i128
; SSE41-NEXT: movq %r12, %rcx
; SSE41-NEXT: callq __muloti4
; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: sbbl %esi, %esi
-; SSE41-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: sbbl %ecx, %ecx
-; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: pinsrd $1, %esi, %xmm0
+; SSE41-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; SSE41-NEXT: setne %cl
+; SSE41-NEXT: xorl %esi, %esi
+; SSE41-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; SSE41-NEXT: setne %sil
+; SSE41-NEXT: movd %esi, %xmm0
+; SSE41-NEXT: pinsrb $8, %ecx, %xmm0
; SSE41-NEXT: movq %rdx, 24(%r15)
; SSE41-NEXT: movq %rax, 16(%r15)
; SSE41-NEXT: movq %rbp, 8(%r15)
; SSE41-NEXT: movq %r13, (%r15)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: addq $24, %rsp
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
@@ -2507,16 +2625,20 @@ define <2 x i32> @smulo_v2i128(<2 x i128
; AVX1-NEXT: movq %r12, %rcx
; AVX1-NEXT: callq __muloti4
; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; AVX1-NEXT: sbbl %esi, %esi
-; AVX1-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; AVX1-NEXT: sbbl %ecx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX1-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: setne %cl
+; AVX1-NEXT: xorl %esi, %esi
+; AVX1-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: setne %sil
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; AVX1-NEXT: movq %rdx, 24(%r15)
; AVX1-NEXT: movq %rax, 16(%r15)
; AVX1-NEXT: movq %rbp, 8(%r15)
; AVX1-NEXT: movq %r13, (%r15)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
@@ -2555,16 +2677,20 @@ define <2 x i32> @smulo_v2i128(<2 x i128
; AVX2-NEXT: movq %r12, %rcx
; AVX2-NEXT: callq __muloti4
; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: sbbl %esi, %esi
-; AVX2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: sbbl %ecx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX2-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: setne %cl
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: setne %sil
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; AVX2-NEXT: movq %rdx, 24(%r15)
; AVX2-NEXT: movq %rax, 16(%r15)
; AVX2-NEXT: movq %rbp, 8(%r15)
; AVX2-NEXT: movq %r13, (%r15)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: addq $24, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
@@ -2584,23 +2710,23 @@ define <2 x i32> @smulo_v2i128(<2 x i128
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: subq $24, %rsp
; AVX512-NEXT: movq %r8, %rax
-; AVX512-NEXT: movq %rcx, %r15
+; AVX512-NEXT: movq %rcx, %r14
; AVX512-NEXT: movq %rdx, %rbx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13
; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8
; AVX512-NEXT: movq %rax, %rdx
; AVX512-NEXT: movq %r9, %rcx
; AVX512-NEXT: callq __muloti4
-; AVX512-NEXT: movq %rax, %r14
+; AVX512-NEXT: movq %rax, %r13
; AVX512-NEXT: movq %rdx, %rbp
; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8
; AVX512-NEXT: movq %rbx, %rdi
-; AVX512-NEXT: movq %r15, %rsi
+; AVX512-NEXT: movq %r14, %rsi
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq %r13, %rcx
+; AVX512-NEXT: movq %r12, %rcx
; AVX512-NEXT: callq __muloti4
; AVX512-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: setne %cl
@@ -2611,12 +2737,12 @@ define <2 x i32> @smulo_v2i128(<2 x i128
; AVX512-NEXT: andl $1, %ecx
; AVX512-NEXT: kmovw %ecx, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
+; AVX512-NEXT: movq %rdx, 24(%r15)
+; AVX512-NEXT: movq %rax, 16(%r15)
+; AVX512-NEXT: movq %rbp, 8(%r15)
+; AVX512-NEXT: movq %r13, (%r15)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: movq %rdx, 24(%r12)
-; AVX512-NEXT: movq %rax, 16(%r12)
-; AVX512-NEXT: movq %rbp, 8(%r12)
-; AVX512-NEXT: movq %r14, (%r12)
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: addq $24, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
Modified: llvm/trunk/test/CodeGen/X86/vec_ssubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_ssubo.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_ssubo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_ssubo.ll Tue Aug 6 16:00:43 2019
@@ -49,76 +49,134 @@ define <1 x i32> @ssubo_v1i32(<1 x i32>
}
define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
-; SSE-LABEL: ssubo_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE-NEXT: psubd %xmm1, %xmm0
-; SSE-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: movq %xmm0, (%rdi)
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: ssubo_v2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllq $32, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: psubq %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ssubo_v2i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: psllq $32, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: psubq %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: psllq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: ssubo_v2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllq $32, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: psubq %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllq $32, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT: movq %xmm1, (%rdi)
+; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
-; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -1143,38 +1201,38 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT: pxor %xmm5, %xmm0
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v2i64:
@@ -1191,38 +1249,38 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT: pxor %xmm5, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm1, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
+; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSSE3-NEXT: pxor %xmm5, %xmm0
+; SSSE3-NEXT: pandn %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v2i64:
@@ -1235,12 +1293,12 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm6, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm5
; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
@@ -1250,8 +1308,8 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; SSE41-NEXT: pand %xmm6, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm3
@@ -1262,11 +1320,11 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
; SSE41-NEXT: pcmpeqq %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: pandn %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pandn %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v2i64:
@@ -1284,7 +1342,6 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
@@ -1303,7 +1360,6 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
@@ -1313,13 +1369,13 @@ define <2 x i32> @ssubo_v2i64(<2 x i64>
; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k2
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
; AVX512-NEXT: kxorw %k2, %k1, %k1
; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
@@ -1605,44 +1661,44 @@ define <2 x i32> @ssubo_v2i128(<2 x i128
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: testq %r9, %r9
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: testq %rsi, %rsi
-; SSE2-NEXT: setns %bl
-; SSE2-NEXT: cmpb %al, %bl
-; SSE2-NEXT: setne %bpl
-; SSE2-NEXT: subq %r8, %rdi
-; SSE2-NEXT: sbbq %r9, %rsi
-; SSE2-NEXT: setns %al
-; SSE2-NEXT: cmpb %al, %bl
-; SSE2-NEXT: setne %al
-; SSE2-NEXT: andb %bpl, %al
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq %rcx, %rbp
-; SSE2-NEXT: sbbq %r10, %rbp
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: sbbq %r11, %rax
; SSE2-NEXT: setns %bl
; SSE2-NEXT: testq %rcx, %rcx
; SSE2-NEXT: setns %cl
; SSE2-NEXT: cmpb %bl, %cl
-; SSE2-NEXT: setne %r8b
-; SSE2-NEXT: testq %r10, %r10
+; SSE2-NEXT: setne %bpl
+; SSE2-NEXT: testq %r11, %r11
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: setne %cl
+; SSE2-NEXT: andb %bpl, %cl
+; SSE2-NEXT: movzbl %cl, %ebp
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: testq %rsi, %rsi
+; SSE2-NEXT: setns %cl
+; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: setne %r11b
+; SSE2-NEXT: subq %r8, %rdi
+; SSE2-NEXT: sbbq %r9, %rsi
; SSE2-NEXT: setns %bl
; SSE2-NEXT: cmpb %bl, %cl
; SSE2-NEXT: setne %cl
-; SSE2-NEXT: andb %r8b, %cl
+; SSE2-NEXT: andb %r11b, %cl
; SSE2-NEXT: movzbl %cl, %ecx
-; SSE2-NEXT: negl %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: movzbl %al, %eax
-; SSE2-NEXT: negl %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rdx, 16(%r11)
-; SSE2-NEXT: movq %rdi, (%r11)
-; SSE2-NEXT: movq %rbp, 24(%r11)
-; SSE2-NEXT: movq %rsi, 8(%r11)
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: pinsrw $4, %ebp, %xmm0
+; SSE2-NEXT: movq %rdx, 16(%r10)
+; SSE2-NEXT: movq %rdi, (%r10)
+; SSE2-NEXT: movq %rax, 24(%r10)
+; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
@@ -1651,44 +1707,44 @@ define <2 x i32> @ssubo_v2i128(<2 x i128
; SSSE3: # %bb.0:
; SSSE3-NEXT: pushq %rbp
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSSE3-NEXT: testq %r9, %r9
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: testq %rsi, %rsi
-; SSSE3-NEXT: setns %bl
-; SSSE3-NEXT: cmpb %al, %bl
-; SSSE3-NEXT: setne %bpl
-; SSSE3-NEXT: subq %r8, %rdi
-; SSSE3-NEXT: sbbq %r9, %rsi
-; SSSE3-NEXT: setns %al
-; SSSE3-NEXT: cmpb %al, %bl
-; SSSE3-NEXT: setne %al
-; SSSE3-NEXT: andb %bpl, %al
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; SSSE3-NEXT: movq %rcx, %rbp
-; SSSE3-NEXT: sbbq %r10, %rbp
+; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: sbbq %r11, %rax
; SSSE3-NEXT: setns %bl
; SSSE3-NEXT: testq %rcx, %rcx
; SSSE3-NEXT: setns %cl
; SSSE3-NEXT: cmpb %bl, %cl
-; SSSE3-NEXT: setne %r8b
-; SSSE3-NEXT: testq %r10, %r10
+; SSSE3-NEXT: setne %bpl
+; SSSE3-NEXT: testq %r11, %r11
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: setne %cl
+; SSSE3-NEXT: andb %bpl, %cl
+; SSSE3-NEXT: movzbl %cl, %ebp
+; SSSE3-NEXT: testq %r9, %r9
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: testq %rsi, %rsi
+; SSSE3-NEXT: setns %cl
+; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: setne %r11b
+; SSSE3-NEXT: subq %r8, %rdi
+; SSSE3-NEXT: sbbq %r9, %rsi
; SSSE3-NEXT: setns %bl
; SSSE3-NEXT: cmpb %bl, %cl
; SSSE3-NEXT: setne %cl
-; SSSE3-NEXT: andb %r8b, %cl
+; SSSE3-NEXT: andb %r11b, %cl
; SSSE3-NEXT: movzbl %cl, %ecx
-; SSSE3-NEXT: negl %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: movzbl %al, %eax
-; SSSE3-NEXT: negl %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rdx, 16(%r11)
-; SSSE3-NEXT: movq %rdi, (%r11)
-; SSSE3-NEXT: movq %rbp, 24(%r11)
-; SSSE3-NEXT: movq %rsi, 8(%r11)
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0
+; SSSE3-NEXT: movq %rdx, 16(%r10)
+; SSSE3-NEXT: movq %rdi, (%r10)
+; SSSE3-NEXT: movq %rax, 24(%r10)
+; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %rbp
; SSSE3-NEXT: retq
@@ -1697,43 +1753,44 @@ define <2 x i32> @ssubo_v2i128(<2 x i128
; SSE41: # %bb.0:
; SSE41-NEXT: pushq %rbp
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE41-NEXT: testq %r9, %r9
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: testq %rsi, %rsi
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: cmpb %al, %bl
-; SSE41-NEXT: setne %bpl
-; SSE41-NEXT: subq %r8, %rdi
-; SSE41-NEXT: sbbq %r9, %rsi
-; SSE41-NEXT: setns %al
-; SSE41-NEXT: cmpb %al, %bl
-; SSE41-NEXT: setne %al
-; SSE41-NEXT: andb %bpl, %al
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movq %rcx, %rbp
-; SSE41-NEXT: sbbq %r10, %rbp
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: sbbq %r11, %rax
; SSE41-NEXT: setns %bl
; SSE41-NEXT: testq %rcx, %rcx
; SSE41-NEXT: setns %cl
; SSE41-NEXT: cmpb %bl, %cl
-; SSE41-NEXT: setne %r8b
-; SSE41-NEXT: testq %r10, %r10
+; SSE41-NEXT: setne %bpl
+; SSE41-NEXT: testq %r11, %r11
; SSE41-NEXT: setns %bl
; SSE41-NEXT: cmpb %bl, %cl
; SSE41-NEXT: setne %cl
-; SSE41-NEXT: andb %r8b, %cl
+; SSE41-NEXT: andb %bpl, %cl
+; SSE41-NEXT: movzbl %cl, %ebp
+; SSE41-NEXT: testq %r9, %r9
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: testq %rsi, %rsi
+; SSE41-NEXT: setns %cl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: setne %r11b
+; SSE41-NEXT: subq %r8, %rdi
+; SSE41-NEXT: sbbq %r9, %rsi
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: setne %cl
+; SSE41-NEXT: andb %r11b, %cl
; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: negl %ecx
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: negl %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; SSE41-NEXT: movq %rdx, 16(%r11)
-; SSE41-NEXT: movq %rdi, (%r11)
-; SSE41-NEXT: movq %rbp, 24(%r11)
-; SSE41-NEXT: movq %rsi, 8(%r11)
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%r10)
+; SSE41-NEXT: movq %rdi, (%r10)
+; SSE41-NEXT: movq %rax, 24(%r10)
+; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %rbp
; SSE41-NEXT: retq
@@ -1742,43 +1799,44 @@ define <2 x i32> @ssubo_v2i128(<2 x i128
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX1-NEXT: testq %r9, %r9
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: testq %rsi, %rsi
-; AVX1-NEXT: setns %bl
-; AVX1-NEXT: cmpb %al, %bl
-; AVX1-NEXT: setne %bpl
-; AVX1-NEXT: subq %r8, %rdi
-; AVX1-NEXT: sbbq %r9, %rsi
-; AVX1-NEXT: setns %al
-; AVX1-NEXT: cmpb %al, %bl
-; AVX1-NEXT: setne %al
-; AVX1-NEXT: andb %bpl, %al
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; AVX1-NEXT: movq %rcx, %rbp
-; AVX1-NEXT: sbbq %r10, %rbp
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: sbbq %r11, %rax
; AVX1-NEXT: setns %bl
; AVX1-NEXT: testq %rcx, %rcx
; AVX1-NEXT: setns %cl
; AVX1-NEXT: cmpb %bl, %cl
-; AVX1-NEXT: setne %r8b
-; AVX1-NEXT: testq %r10, %r10
+; AVX1-NEXT: setne %bpl
+; AVX1-NEXT: testq %r11, %r11
; AVX1-NEXT: setns %bl
; AVX1-NEXT: cmpb %bl, %cl
; AVX1-NEXT: setne %cl
-; AVX1-NEXT: andb %r8b, %cl
+; AVX1-NEXT: andb %bpl, %cl
+; AVX1-NEXT: movzbl %cl, %ebp
+; AVX1-NEXT: testq %r9, %r9
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: testq %rsi, %rsi
+; AVX1-NEXT: setns %cl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: setne %r11b
+; AVX1-NEXT: subq %r8, %rdi
+; AVX1-NEXT: sbbq %r9, %rsi
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: setne %cl
+; AVX1-NEXT: andb %r11b, %cl
; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: negl %ecx
-; AVX1-NEXT: movzbl %al, %eax
-; AVX1-NEXT: negl %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, 16(%r11)
-; AVX1-NEXT: movq %rdi, (%r11)
-; AVX1-NEXT: movq %rbp, 24(%r11)
-; AVX1-NEXT: movq %rsi, 8(%r11)
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, 16(%r10)
+; AVX1-NEXT: movq %rdi, (%r10)
+; AVX1-NEXT: movq %rax, 24(%r10)
+; AVX1-NEXT: movq %rsi, 8(%r10)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
@@ -1787,43 +1845,44 @@ define <2 x i32> @ssubo_v2i128(<2 x i128
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: testq %r9, %r9
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: setns %bl
-; AVX2-NEXT: cmpb %al, %bl
-; AVX2-NEXT: setne %bpl
-; AVX2-NEXT: subq %r8, %rdi
-; AVX2-NEXT: sbbq %r9, %rsi
-; AVX2-NEXT: setns %al
-; AVX2-NEXT: cmpb %al, %bl
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: andb %bpl, %al
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq %rcx, %rbp
-; AVX2-NEXT: sbbq %r10, %rbp
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: sbbq %r11, %rax
; AVX2-NEXT: setns %bl
; AVX2-NEXT: testq %rcx, %rcx
; AVX2-NEXT: setns %cl
; AVX2-NEXT: cmpb %bl, %cl
-; AVX2-NEXT: setne %r8b
-; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: setne %bpl
+; AVX2-NEXT: testq %r11, %r11
; AVX2-NEXT: setns %bl
; AVX2-NEXT: cmpb %bl, %cl
; AVX2-NEXT: setne %cl
-; AVX2-NEXT: andb %r8b, %cl
+; AVX2-NEXT: andb %bpl, %cl
+; AVX2-NEXT: movzbl %cl, %ebp
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: setns %cl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: setne %r11b
+; AVX2-NEXT: subq %r8, %rdi
+; AVX2-NEXT: sbbq %r9, %rsi
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: setne %cl
+; AVX2-NEXT: andb %r11b, %cl
; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: negl %ecx
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, 16(%r11)
-; AVX2-NEXT: movq %rdi, (%r11)
-; AVX2-NEXT: movq %rbp, 24(%r11)
-; AVX2-NEXT: movq %rsi, 8(%r11)
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, 16(%r10)
+; AVX2-NEXT: movq %rdi, (%r10)
+; AVX2-NEXT: movq %rax, 24(%r10)
+; AVX2-NEXT: movq %rsi, 8(%r10)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
@@ -1864,12 +1923,12 @@ define <2 x i32> @ssubo_v2i128(<2 x i128
; AVX512-NEXT: andl $1, %ecx
; AVX512-NEXT: kmovw %ecx, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%r10)
; AVX512-NEXT: movq %rdi, (%r10)
; AVX512-NEXT: movq %r14, 24(%r10)
; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: retq
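
(For reference, every vNiM test in vec_ssub.ll reduces to the same IR shape. The sketch below is reconstructed from the context lines above: only the call and the first extractvalue are quoted from the diff, while the %obit/%res/store tail follows the file's usual pattern and the %p2 parameter name is an assumption, so treat it as illustrative only. The sext of the <2 x i1> overflow lanes to <2 x i32> is what forces the all-zeros/all-ones masks that the psllq $63 / psrad $31 sequences above materialize.)

define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
  ; subtract with a per-lane signed-overflow flag
  %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0   ; wrapped difference
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1  ; i1 overflow lanes
  %res = sext <2 x i1> %obit to <2 x i32>           ; 0 / -1 mask returned in xmm0
  store <2 x i32> %val, <2 x i32>* %p2              ; difference stored through %rdi
  ret <2 x i32> %res
}
declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
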
Modified: llvm/trunk/test/CodeGen/X86/vec_uaddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_uaddo.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_uaddo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_uaddo.ll Tue Aug 6 16:00:43 2019
@@ -47,61 +47,91 @@ define <1 x i32> @uaddo_v1i32(<1 x i32>
define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE2-LABEL: uaddo_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: paddq %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movq %xmm1, (%rdi)
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: paddq %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: pcmpeqq %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: uaddo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: uaddo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: uaddo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -892,12 +922,12 @@ define <2 x i32> @uaddo_v2i64(<2 x i64>
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: retq
;
@@ -908,7 +938,6 @@ define <2 x i32> @uaddo_v2i64(<2 x i64>
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
@@ -919,7 +948,6 @@ define <2 x i32> @uaddo_v2i64(<2 x i64>
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
@@ -927,9 +955,9 @@ define <2 x i32> @uaddo_v2i64(<2 x i64>
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpltuq %xmm0, %xmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
@@ -1197,17 +1225,21 @@ define <2 x i32> @uaddo_v2i128(<2 x i128
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: sbbl %eax, %eax
+; SSE2-NEXT: setb %al
+; SSE2-NEXT: movzbl %al, %r11d
; SSE2-NEXT: addq %r8, %rdi
; SSE2-NEXT: adcq %r9, %rsi
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: sbbl %eax, %eax
+; SSE2-NEXT: setb %al
+; SSE2-NEXT: movzbl %al, %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pinsrw $4, %r11d, %xmm0
; SSE2-NEXT: movq %rdx, 16(%r10)
; SSE2-NEXT: movq %rdi, (%r10)
; SSE2-NEXT: movq %rcx, 24(%r10)
; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v2i128:
@@ -1215,17 +1247,21 @@ define <2 x i32> @uaddo_v2i128(<2 x i128
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: sbbl %eax, %eax
+; SSSE3-NEXT: setb %al
+; SSSE3-NEXT: movzbl %al, %r11d
; SSSE3-NEXT: addq %r8, %rdi
; SSSE3-NEXT: adcq %r9, %rsi
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: sbbl %eax, %eax
+; SSSE3-NEXT: setb %al
+; SSSE3-NEXT: movzbl %al, %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0
; SSSE3-NEXT: movq %rdx, 16(%r10)
; SSSE3-NEXT: movq %rdi, (%r10)
; SSSE3-NEXT: movq %rcx, 24(%r10)
; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v2i128:
@@ -1233,16 +1269,21 @@ define <2 x i32> @uaddo_v2i128(<2 x i128
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: sbbl %r11d, %r11d
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: movzbl %al, %r11d
; SSE41-NEXT: addq %r8, %rdi
; SSE41-NEXT: adcq %r9, %rsi
-; SSE41-NEXT: sbbl %eax, %eax
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrd $1, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $8, %r11d, %xmm0
; SSE41-NEXT: movq %rdx, 16(%r10)
; SSE41-NEXT: movq %rdi, (%r10)
; SSE41-NEXT: movq %rcx, 24(%r10)
; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: uaddo_v2i128:
@@ -1250,16 +1291,21 @@ define <2 x i32> @uaddo_v2i128(<2 x i128
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX1-NEXT: sbbl %r11d, %r11d
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: movzbl %al, %r11d
; AVX1-NEXT: addq %r8, %rdi
; AVX1-NEXT: adcq %r9, %rsi
-; AVX1-NEXT: sbbl %eax, %eax
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: movzbl %al, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
; AVX1-NEXT: movq %rdx, 16(%r10)
; AVX1-NEXT: movq %rdi, (%r10)
; AVX1-NEXT: movq %rcx, 24(%r10)
; AVX1-NEXT: movq %rsi, 8(%r10)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uaddo_v2i128:
@@ -1267,16 +1313,21 @@ define <2 x i32> @uaddo_v2i128(<2 x i128
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: sbbl %r11d, %r11d
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: movzbl %al, %r11d
; AVX2-NEXT: addq %r8, %rdi
; AVX2-NEXT: adcq %r9, %rsi
-; AVX2-NEXT: sbbl %eax, %eax
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
; AVX2-NEXT: movq %rdx, 16(%r10)
; AVX2-NEXT: movq %rdi, (%r10)
; AVX2-NEXT: movq %rcx, 24(%r10)
; AVX2-NEXT: movq %rsi, 8(%r10)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uaddo_v2i128:
@@ -1293,12 +1344,12 @@ define <2 x i32> @uaddo_v2i128(<2 x i128
; AVX512-NEXT: andl $1, %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%r10)
; AVX512-NEXT: movq %rdi, (%r10)
; AVX512-NEXT: movq %rcx, 24(%r10)
; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
%val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
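
(The unsigned-add file follows the same pattern: the v2i128 test above stores the 256-bit sum through the pointer passed on the stack (%r10 in the asm) and returns the carry lanes as a sign-extended <2 x i32> mask. Sketch below, again reconstructed: only the call and the first extractvalue are quoted from the context lines, and the rest, including the %p2 name, is the file's usual tail and is an assumption.)

define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
  %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>      ; the setb + pinsr + shift sequences above build this mask
  store <2 x i128> %val, <2 x i128>* %p2       ; the four movq stores to (%r10)
  ret <2 x i32> %res
}
declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)
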
Modified: llvm/trunk/test/CodeGen/X86/vec_umulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_umulo.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_umulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_umulo.ll Tue Aug 6 16:00:43 2019
@@ -57,105 +57,205 @@ define <1 x i32> @umulo_v1i32(<1 x i32>
define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE2-LABEL: umulo_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm3, %r8
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm2, %r10
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: movq %xmm1, %rdx
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: mulq %rdx
+; SSE2-NEXT: movq $-1, %r9
+; SSE2-NEXT: movl $0, %ecx
+; SSE2-NEXT: cmovoq %r9, %rcx
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movq %r8, %rax
+; SSE2-NEXT: mulq %r10
+; SSE2-NEXT: movq %rax, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: cmovoq %r9, %rsi
+; SSE2-NEXT: movq %rsi, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movq %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSSE3-NEXT: movq %xmm3, %r8
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movq %xmm2, %r10
+; SSSE3-NEXT: movq %xmm0, %rax
+; SSSE3-NEXT: movq %xmm1, %rdx
+; SSSE3-NEXT: xorl %esi, %esi
+; SSSE3-NEXT: mulq %rdx
+; SSSE3-NEXT: movq $-1, %r9
+; SSSE3-NEXT: movl $0, %ecx
+; SSSE3-NEXT: cmovoq %r9, %rcx
+; SSSE3-NEXT: movq %rax, %xmm0
+; SSSE3-NEXT: movq %r8, %rax
+; SSSE3-NEXT: mulq %r10
+; SSSE3-NEXT: movq %rax, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3-NEXT: psrlq $32, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: movq %rcx, %xmm0
+; SSSE3-NEXT: cmovoq %r9, %rsi
+; SSSE3-NEXT: movq %rsi, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: movq %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pmuludq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: movq %xmm0, %r8
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movq %xmm1, %rcx
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: pextrq $1, %xmm1, %rdx
+; SSE41-NEXT: xorl %esi, %esi
+; SSE41-NEXT: mulq %rdx
+; SSE41-NEXT: movq %rax, %r9
+; SSE41-NEXT: movq $-1, %r10
+; SSE41-NEXT: movl $0, %eax
+; SSE41-NEXT: cmovoq %r10, %rax
+; SSE41-NEXT: movq %rax, %xmm0
+; SSE41-NEXT: movq %r8, %rax
+; SSE41-NEXT: mulq %rcx
+; SSE41-NEXT: cmovoq %r10, %rsi
+; SSE41-NEXT: movq %rsi, %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movq %r9, %xmm0
+; SSE41-NEXT: movq %rax, %xmm3
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
+; SSE41-NEXT: psrlq $32, %xmm3
+; SSE41-NEXT: pcmpeqq %xmm2, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movq %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vmovq %xmm0, %r8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX1-NEXT: xorl %esi, %esi
+; AVX1-NEXT: mulq %rdx
+; AVX1-NEXT: movq %rax, %r9
+; AVX1-NEXT: movq $-1, %r10
+; AVX1-NEXT: movl $0, %eax
+; AVX1-NEXT: cmovoq %r10, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: movq %r8, %rax
+; AVX1-NEXT: mulq %rcx
+; AVX1-NEXT: cmovoq %r10, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovq %r9, %xmm1
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vmovdqa %xmm2, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vmovq %xmm0, %r8
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: mulq %rdx
+; AVX2-NEXT: movq %rax, %r9
+; AVX2-NEXT: movq $-1, %r10
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: cmovoq %r10, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: movq %r8, %rax
+; AVX2-NEXT: mulq %rcx
+; AVX2-NEXT: cmovoq %r10, %rsi
+; AVX2-NEXT: vmovq %rsi, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovq %r9, %xmm1
+; AVX2-NEXT: vmovq %rax, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vmovdqa %xmm2, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
-; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
-; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT: vmovq %xmm1, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: mulq %rdx
+; AVX512-NEXT: seto %r8b
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: mulq %rsi
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
+; AVX512-NEXT: vptestmq %xmm1, %xmm1, %k0
+; AVX512-NEXT: kmovd %r8d, %k1
+; AVX512-NEXT: kshiftlw $1, %k1, %k1
+; AVX512-NEXT: seto %al
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: kmovw %eax, %k2
+; AVX512-NEXT: korw %k1, %k2, %k1
+; AVX512-NEXT: korw %k1, %k0, %k1
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -1409,152 +1509,150 @@ define <2 x i32> @umulo_v2i64(<2 x i64>
; SSE2-LABEL: umulo_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %r8
+; SSE2-NEXT: movq %xmm2, %r9
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %r10
+; SSE2-NEXT: movq %xmm2, %rsi
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: mulq %rdx
-; SSE2-NEXT: movq $-1, %r9
-; SSE2-NEXT: movl $0, %esi
-; SSE2-NEXT: cmovoq %r9, %rsi
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: movq %r8, %rax
-; SSE2-NEXT: mulq %r10
+; SSE2-NEXT: movq %rax, %r8
+; SSE2-NEXT: movq $-1, %r10
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: cmovoq %r10, %rax
; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movq %rsi, %xmm0
-; SSE2-NEXT: cmovoq %r9, %rcx
-; SSE2-NEXT: movq %rcx, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movq %r9, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: cmovoq %r10, %rcx
+; SSE2-NEXT: movq %rcx, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movq %r8, %xmm1
+; SSE2-NEXT: movq %rax, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: movdqa %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSSE3-NEXT: movq %xmm2, %r8
+; SSSE3-NEXT: movq %xmm2, %r9
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movq %xmm2, %r10
+; SSSE3-NEXT: movq %xmm2, %rsi
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: movq %xmm1, %rdx
; SSSE3-NEXT: xorl %ecx, %ecx
; SSSE3-NEXT: mulq %rdx
-; SSSE3-NEXT: movq $-1, %r9
-; SSSE3-NEXT: movl $0, %esi
-; SSSE3-NEXT: cmovoq %r9, %rsi
-; SSSE3-NEXT: movq %rax, %xmm1
-; SSSE3-NEXT: movq %r8, %rax
-; SSSE3-NEXT: mulq %r10
+; SSSE3-NEXT: movq %rax, %r8
+; SSSE3-NEXT: movq $-1, %r10
+; SSSE3-NEXT: movl $0, %eax
+; SSSE3-NEXT: cmovoq %r10, %rax
; SSSE3-NEXT: movq %rax, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: movq %rsi, %xmm0
-; SSSE3-NEXT: cmovoq %r9, %rcx
-; SSSE3-NEXT: movq %rcx, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: movq %r9, %rax
+; SSSE3-NEXT: mulq %rsi
+; SSSE3-NEXT: cmovoq %r10, %rcx
+; SSSE3-NEXT: movq %rcx, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movq %r8, %xmm1
+; SSSE3-NEXT: movq %rax, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: movdqa %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %r10
-; SSE41-NEXT: movq %xmm1, %r8
+; SSE41-NEXT: movq %xmm0, %rcx
+; SSE41-NEXT: movq %xmm1, %r9
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: pextrq $1, %xmm1, %rdx
; SSE41-NEXT: xorl %esi, %esi
; SSE41-NEXT: mulq %rdx
-; SSE41-NEXT: movq $-1, %r9
-; SSE41-NEXT: movl $0, %ecx
-; SSE41-NEXT: cmovoq %r9, %rcx
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: movq %r10, %rax
-; SSE41-NEXT: mulq %r8
+; SSE41-NEXT: movq %rax, %r8
+; SSE41-NEXT: movq $-1, %r10
+; SSE41-NEXT: movl $0, %eax
+; SSE41-NEXT: cmovoq %r10, %rax
; SSE41-NEXT: movq %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: movq %rcx, %xmm0
-; SSE41-NEXT: cmovoq %r9, %rsi
-; SSE41-NEXT: movq %rsi, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE41-NEXT: movdqa %xmm1, (%rdi)
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %r9
+; SSE41-NEXT: cmovoq %r10, %rsi
+; SSE41-NEXT: movq %rsi, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: movq %r8, %xmm1
+; SSE41-NEXT: movq %rax, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE41-NEXT: movdqa %xmm2, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %xmm0, %r10
-; AVX1-NEXT: vmovq %xmm1, %r8
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vmovq %xmm1, %r9
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
; AVX1-NEXT: xorl %esi, %esi
; AVX1-NEXT: mulq %rdx
-; AVX1-NEXT: movq $-1, %r9
-; AVX1-NEXT: movl $0, %ecx
-; AVX1-NEXT: cmovoq %r9, %rcx
+; AVX1-NEXT: movq %rax, %r8
+; AVX1-NEXT: movq $-1, %r10
+; AVX1-NEXT: movl $0, %eax
+; AVX1-NEXT: cmovoq %r10, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: movq %r10, %rax
-; AVX1-NEXT: mulq %r8
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: cmovoq %r9, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %r9
+; AVX1-NEXT: cmovoq %r10, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovq %r8, %xmm1
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %xmm0, %r10
-; AVX2-NEXT: vmovq %xmm1, %r8
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vmovq %xmm1, %r9
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vpextrq $1, %xmm1, %rdx
; AVX2-NEXT: xorl %esi, %esi
; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: movq $-1, %r9
-; AVX2-NEXT: movl $0, %ecx
-; AVX2-NEXT: cmovoq %r9, %rcx
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq $-1, %r10
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: cmovoq %r10, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: movq %r10, %rax
-; AVX2-NEXT: mulq %r8
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: cmovoq %r9, %rsi
-; AVX2-NEXT: vmovq %rsi, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %r9
+; AVX2-NEXT: cmovoq %r10, %rsi
+; AVX2-NEXT: vmovq %rsi, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovq %r8, %xmm1
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %r8
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vmovq %xmm1, %rdx
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: vmovq %xmm1, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
; AVX512-NEXT: mulq %rdx
-; AVX512-NEXT: movq %rax, %rsi
-; AVX512-NEXT: seto %r9b
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: mulq %r8
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vmovq %rsi, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX512-NEXT: movq %rax, %r8
; AVX512-NEXT: seto %al
; AVX512-NEXT: kmovd %eax, %k0
-; AVX512-NEXT: kmovd %r9d, %k1
-; AVX512-NEXT: kshiftrw $1, %k1, %k2
-; AVX512-NEXT: kxorw %k0, %k2, %k0
-; AVX512-NEXT: kshiftlw $15, %k0, %k0
-; AVX512-NEXT: kshiftrw $14, %k0, %k0
-; AVX512-NEXT: kxorw %k0, %k1, %k1
+; AVX512-NEXT: kshiftlw $1, %k0, %k0
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: mulq %rsi
+; AVX512-NEXT: seto %cl
+; AVX512-NEXT: andl $1, %ecx
+; AVX512-NEXT: kmovw %ecx, %k1
+; AVX512-NEXT: korw %k0, %k1, %k1
+; AVX512-NEXT: vmovq %r8, %xmm0
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
@@ -2069,67 +2167,66 @@ define <2 x i32> @umulo_v2i128(<2 x i128
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq %r9, %r10
-; SSE2-NEXT: movq %rcx, %r12
-; SSE2-NEXT: movq %rdx, %r11
-; SSE2-NEXT: movq %rsi, %rax
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: movq %rdx, %r12
+; SSE2-NEXT: movq %rdi, %r11
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: testq %r10, %r10
-; SSE2-NEXT: setne %cl
+; SSE2-NEXT: setne %dl
+; SSE2-NEXT: testq %rcx, %rcx
+; SSE2-NEXT: setne %r13b
+; SSE2-NEXT: andb %dl, %r13b
+; SSE2-NEXT: mulq %r15
+; SSE2-NEXT: movq %rax, %rdi
+; SSE2-NEXT: seto %bpl
+; SSE2-NEXT: movq %r10, %rax
+; SSE2-NEXT: mulq %r12
+; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: seto %cl
+; SSE2-NEXT: orb %bpl, %cl
+; SSE2-NEXT: addq %rdi, %rbx
+; SSE2-NEXT: movq %r12, %rax
+; SSE2-NEXT: mulq %r15
+; SSE2-NEXT: movq %rax, %r10
+; SSE2-NEXT: movq %rdx, %r15
+; SSE2-NEXT: addq %rbx, %r15
+; SSE2-NEXT: setb %al
+; SSE2-NEXT: orb %cl, %al
+; SSE2-NEXT: orb %r13b, %al
+; SSE2-NEXT: movzbl %al, %ebp
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: setne %al
; SSE2-NEXT: testq %rsi, %rsi
; SSE2-NEXT: setne %r13b
-; SSE2-NEXT: andb %cl, %r13b
+; SSE2-NEXT: andb %al, %r13b
+; SSE2-NEXT: movq %rsi, %rax
; SSE2-NEXT: mulq %r8
; SSE2-NEXT: movq %rax, %rsi
-; SSE2-NEXT: seto %bpl
-; SSE2-NEXT: movq %r10, %rax
-; SSE2-NEXT: mulq %rdi
-; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: seto %r12b
+; SSE2-NEXT: movq %r9, %rax
+; SSE2-NEXT: mulq %r11
+; SSE2-NEXT: movq %rax, %rdi
; SSE2-NEXT: seto %bl
-; SSE2-NEXT: orb %bpl, %bl
-; SSE2-NEXT: addq %rsi, %rcx
-; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: orb %r12b, %bl
+; SSE2-NEXT: addq %rsi, %rdi
+; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: addq %rcx, %rsi
+; SSE2-NEXT: addq %rdi, %rdx
; SSE2-NEXT: setb %cl
; SSE2-NEXT: orb %bl, %cl
; SSE2-NEXT: orb %r13b, %cl
-; SSE2-NEXT: testq %r9, %r9
-; SSE2-NEXT: setne %al
-; SSE2-NEXT: testq %r12, %r12
-; SSE2-NEXT: setne %r8b
-; SSE2-NEXT: andb %al, %r8b
-; SSE2-NEXT: movq %r12, %rax
-; SSE2-NEXT: mulq %r15
-; SSE2-NEXT: movq %rax, %rbp
-; SSE2-NEXT: seto %r10b
-; SSE2-NEXT: movq %r9, %rax
-; SSE2-NEXT: mulq %r11
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: seto %r9b
-; SSE2-NEXT: orb %r10b, %r9b
-; SSE2-NEXT: addq %rbp, %rbx
-; SSE2-NEXT: movq %r11, %rax
-; SSE2-NEXT: mulq %r15
-; SSE2-NEXT: addq %rbx, %rdx
-; SSE2-NEXT: setb %bl
-; SSE2-NEXT: orb %r9b, %bl
-; SSE2-NEXT: orb %r8b, %bl
-; SSE2-NEXT: movzbl %bl, %ebp
-; SSE2-NEXT: negl %ebp
-; SSE2-NEXT: movd %ebp, %xmm1
; SSE2-NEXT: movzbl %cl, %ecx
-; SSE2-NEXT: negl %ecx
; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rax, 16(%r14)
-; SSE2-NEXT: movq %rdi, (%r14)
-; SSE2-NEXT: movq %rdx, 24(%r14)
-; SSE2-NEXT: movq %rsi, 8(%r14)
+; SSE2-NEXT: pinsrw $4, %ebp, %xmm0
+; SSE2-NEXT: movq %r10, 16(%r14)
+; SSE2-NEXT: movq %rax, (%r14)
+; SSE2-NEXT: movq %r15, 24(%r14)
+; SSE2-NEXT: movq %rdx, 8(%r14)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -2146,67 +2243,66 @@ define <2 x i32> @umulo_v2i128(<2 x i128
; SSSE3-NEXT: pushq %r13
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq %r9, %r10
-; SSSE3-NEXT: movq %rcx, %r12
-; SSSE3-NEXT: movq %rdx, %r11
-; SSSE3-NEXT: movq %rsi, %rax
+; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: movq %rdx, %r12
+; SSSE3-NEXT: movq %rdi, %r11
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT: testq %r10, %r10
-; SSSE3-NEXT: setne %cl
+; SSSE3-NEXT: setne %dl
+; SSSE3-NEXT: testq %rcx, %rcx
+; SSSE3-NEXT: setne %r13b
+; SSSE3-NEXT: andb %dl, %r13b
+; SSSE3-NEXT: mulq %r15
+; SSSE3-NEXT: movq %rax, %rdi
+; SSSE3-NEXT: seto %bpl
+; SSSE3-NEXT: movq %r10, %rax
+; SSSE3-NEXT: mulq %r12
+; SSSE3-NEXT: movq %rax, %rbx
+; SSSE3-NEXT: seto %cl
+; SSSE3-NEXT: orb %bpl, %cl
+; SSSE3-NEXT: addq %rdi, %rbx
+; SSSE3-NEXT: movq %r12, %rax
+; SSSE3-NEXT: mulq %r15
+; SSSE3-NEXT: movq %rax, %r10
+; SSSE3-NEXT: movq %rdx, %r15
+; SSSE3-NEXT: addq %rbx, %r15
+; SSSE3-NEXT: setb %al
+; SSSE3-NEXT: orb %cl, %al
+; SSSE3-NEXT: orb %r13b, %al
+; SSSE3-NEXT: movzbl %al, %ebp
+; SSSE3-NEXT: testq %r9, %r9
+; SSSE3-NEXT: setne %al
; SSSE3-NEXT: testq %rsi, %rsi
; SSSE3-NEXT: setne %r13b
-; SSSE3-NEXT: andb %cl, %r13b
+; SSSE3-NEXT: andb %al, %r13b
+; SSSE3-NEXT: movq %rsi, %rax
; SSSE3-NEXT: mulq %r8
; SSSE3-NEXT: movq %rax, %rsi
-; SSSE3-NEXT: seto %bpl
-; SSSE3-NEXT: movq %r10, %rax
-; SSSE3-NEXT: mulq %rdi
-; SSSE3-NEXT: movq %rax, %rcx
+; SSSE3-NEXT: seto %r12b
+; SSSE3-NEXT: movq %r9, %rax
+; SSSE3-NEXT: mulq %r11
+; SSSE3-NEXT: movq %rax, %rdi
; SSSE3-NEXT: seto %bl
-; SSSE3-NEXT: orb %bpl, %bl
-; SSSE3-NEXT: addq %rsi, %rcx
-; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: orb %r12b, %bl
+; SSSE3-NEXT: addq %rsi, %rdi
+; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: addq %rcx, %rsi
+; SSSE3-NEXT: addq %rdi, %rdx
; SSSE3-NEXT: setb %cl
; SSSE3-NEXT: orb %bl, %cl
; SSSE3-NEXT: orb %r13b, %cl
-; SSSE3-NEXT: testq %r9, %r9
-; SSSE3-NEXT: setne %al
-; SSSE3-NEXT: testq %r12, %r12
-; SSSE3-NEXT: setne %r8b
-; SSSE3-NEXT: andb %al, %r8b
-; SSSE3-NEXT: movq %r12, %rax
-; SSSE3-NEXT: mulq %r15
-; SSSE3-NEXT: movq %rax, %rbp
-; SSSE3-NEXT: seto %r10b
-; SSSE3-NEXT: movq %r9, %rax
-; SSSE3-NEXT: mulq %r11
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: seto %r9b
-; SSSE3-NEXT: orb %r10b, %r9b
-; SSSE3-NEXT: addq %rbp, %rbx
-; SSSE3-NEXT: movq %r11, %rax
-; SSSE3-NEXT: mulq %r15
-; SSSE3-NEXT: addq %rbx, %rdx
-; SSSE3-NEXT: setb %bl
-; SSSE3-NEXT: orb %r9b, %bl
-; SSSE3-NEXT: orb %r8b, %bl
-; SSSE3-NEXT: movzbl %bl, %ebp
-; SSSE3-NEXT: negl %ebp
-; SSSE3-NEXT: movd %ebp, %xmm1
; SSSE3-NEXT: movzbl %cl, %ecx
-; SSSE3-NEXT: negl %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rax, 16(%r14)
-; SSSE3-NEXT: movq %rdi, (%r14)
-; SSSE3-NEXT: movq %rdx, 24(%r14)
-; SSSE3-NEXT: movq %rsi, 8(%r14)
+; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0
+; SSSE3-NEXT: movq %r10, 16(%r14)
+; SSSE3-NEXT: movq %rax, (%r14)
+; SSSE3-NEXT: movq %r15, 24(%r14)
+; SSSE3-NEXT: movq %rdx, 8(%r14)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
@@ -2223,66 +2319,66 @@ define <2 x i32> @umulo_v2i128(<2 x i128
; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movq %r9, %r10
-; SSE41-NEXT: movq %rcx, %r12
-; SSE41-NEXT: movq %rdx, %r11
-; SSE41-NEXT: movq %rsi, %rax
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: movq %rdx, %r12
+; SSE41-NEXT: movq %rdi, %r11
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT: testq %r10, %r10
-; SSE41-NEXT: setne %cl
+; SSE41-NEXT: setne %dl
+; SSE41-NEXT: testq %rcx, %rcx
+; SSE41-NEXT: setne %r13b
+; SSE41-NEXT: andb %dl, %r13b
+; SSE41-NEXT: mulq %r15
+; SSE41-NEXT: movq %rax, %rdi
+; SSE41-NEXT: seto %bpl
+; SSE41-NEXT: movq %r10, %rax
+; SSE41-NEXT: mulq %r12
+; SSE41-NEXT: movq %rax, %rbx
+; SSE41-NEXT: seto %cl
+; SSE41-NEXT: orb %bpl, %cl
+; SSE41-NEXT: addq %rdi, %rbx
+; SSE41-NEXT: movq %r12, %rax
+; SSE41-NEXT: mulq %r15
+; SSE41-NEXT: movq %rax, %r10
+; SSE41-NEXT: movq %rdx, %r15
+; SSE41-NEXT: addq %rbx, %r15
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: orb %cl, %al
+; SSE41-NEXT: orb %r13b, %al
+; SSE41-NEXT: movzbl %al, %ebp
+; SSE41-NEXT: testq %r9, %r9
+; SSE41-NEXT: setne %al
; SSE41-NEXT: testq %rsi, %rsi
; SSE41-NEXT: setne %r13b
-; SSE41-NEXT: andb %cl, %r13b
+; SSE41-NEXT: andb %al, %r13b
+; SSE41-NEXT: movq %rsi, %rax
; SSE41-NEXT: mulq %r8
; SSE41-NEXT: movq %rax, %rsi
-; SSE41-NEXT: seto %bpl
-; SSE41-NEXT: movq %r10, %rax
-; SSE41-NEXT: mulq %rdi
-; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: seto %r12b
+; SSE41-NEXT: movq %r9, %rax
+; SSE41-NEXT: mulq %r11
+; SSE41-NEXT: movq %rax, %rdi
; SSE41-NEXT: seto %bl
-; SSE41-NEXT: orb %bpl, %bl
-; SSE41-NEXT: addq %rsi, %rcx
-; SSE41-NEXT: movq %rdi, %rax
+; SSE41-NEXT: orb %r12b, %bl
+; SSE41-NEXT: addq %rsi, %rdi
+; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: movq %rdx, %rsi
-; SSE41-NEXT: addq %rcx, %rsi
+; SSE41-NEXT: addq %rdi, %rdx
; SSE41-NEXT: setb %cl
; SSE41-NEXT: orb %bl, %cl
; SSE41-NEXT: orb %r13b, %cl
-; SSE41-NEXT: testq %r9, %r9
-; SSE41-NEXT: setne %al
-; SSE41-NEXT: testq %r12, %r12
-; SSE41-NEXT: setne %r8b
-; SSE41-NEXT: andb %al, %r8b
-; SSE41-NEXT: movq %r12, %rax
-; SSE41-NEXT: mulq %r15
-; SSE41-NEXT: movq %rax, %rbp
-; SSE41-NEXT: seto %r10b
-; SSE41-NEXT: movq %r9, %rax
-; SSE41-NEXT: mulq %r11
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: seto %r9b
-; SSE41-NEXT: orb %r10b, %r9b
-; SSE41-NEXT: addq %rbp, %rbx
-; SSE41-NEXT: movq %r11, %rax
-; SSE41-NEXT: mulq %r15
-; SSE41-NEXT: addq %rbx, %rdx
-; SSE41-NEXT: setb %bl
-; SSE41-NEXT: orb %r9b, %bl
-; SSE41-NEXT: orb %r8b, %bl
-; SSE41-NEXT: movzbl %bl, %ebp
-; SSE41-NEXT: negl %ebp
; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: negl %ecx
; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: pinsrd $1, %ebp, %xmm0
-; SSE41-NEXT: movq %rax, 16(%r14)
-; SSE41-NEXT: movq %rdi, (%r14)
-; SSE41-NEXT: movq %rdx, 24(%r14)
-; SSE41-NEXT: movq %rsi, 8(%r14)
+; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
+; SSE41-NEXT: movq %r10, 16(%r14)
+; SSE41-NEXT: movq %rax, (%r14)
+; SSE41-NEXT: movq %r15, 24(%r14)
+; SSE41-NEXT: movq %rdx, 8(%r14)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
@@ -2299,66 +2395,66 @@ define <2 x i32> @umulo_v2i128(<2 x i128
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: movq %r9, %r10
-; AVX1-NEXT: movq %rcx, %r12
-; AVX1-NEXT: movq %rdx, %r11
-; AVX1-NEXT: movq %rsi, %rax
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq %rdi, %r11
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT: testq %r10, %r10
-; AVX1-NEXT: setne %cl
-; AVX1-NEXT: testq %rsi, %rsi
+; AVX1-NEXT: setne %dl
+; AVX1-NEXT: testq %rcx, %rcx
; AVX1-NEXT: setne %r13b
-; AVX1-NEXT: andb %cl, %r13b
-; AVX1-NEXT: mulq %r8
-; AVX1-NEXT: movq %rax, %rsi
+; AVX1-NEXT: andb %dl, %r13b
+; AVX1-NEXT: mulq %r15
+; AVX1-NEXT: movq %rax, %rdi
; AVX1-NEXT: seto %bpl
; AVX1-NEXT: movq %r10, %rax
-; AVX1-NEXT: mulq %rdi
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: seto %bl
-; AVX1-NEXT: orb %bpl, %bl
-; AVX1-NEXT: addq %rsi, %rcx
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: mulq %r8
-; AVX1-NEXT: movq %rax, %rdi
-; AVX1-NEXT: movq %rdx, %rsi
-; AVX1-NEXT: addq %rcx, %rsi
-; AVX1-NEXT: setb %cl
-; AVX1-NEXT: orb %bl, %cl
-; AVX1-NEXT: orb %r13b, %cl
-; AVX1-NEXT: testq %r9, %r9
-; AVX1-NEXT: setne %al
-; AVX1-NEXT: testq %r12, %r12
-; AVX1-NEXT: setne %r8b
-; AVX1-NEXT: andb %al, %r8b
+; AVX1-NEXT: mulq %r12
+; AVX1-NEXT: movq %rax, %rbx
+; AVX1-NEXT: seto %cl
+; AVX1-NEXT: orb %bpl, %cl
+; AVX1-NEXT: addq %rdi, %rbx
; AVX1-NEXT: movq %r12, %rax
; AVX1-NEXT: mulq %r15
-; AVX1-NEXT: movq %rax, %rbp
-; AVX1-NEXT: seto %r10b
+; AVX1-NEXT: movq %rax, %r10
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: addq %rbx, %r15
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: orb %cl, %al
+; AVX1-NEXT: orb %r13b, %al
+; AVX1-NEXT: movzbl %al, %ebp
+; AVX1-NEXT: testq %r9, %r9
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: testq %rsi, %rsi
+; AVX1-NEXT: setne %r13b
+; AVX1-NEXT: andb %al, %r13b
+; AVX1-NEXT: movq %rsi, %rax
+; AVX1-NEXT: mulq %r8
+; AVX1-NEXT: movq %rax, %rsi
+; AVX1-NEXT: seto %r12b
; AVX1-NEXT: movq %r9, %rax
; AVX1-NEXT: mulq %r11
-; AVX1-NEXT: movq %rax, %rbx
-; AVX1-NEXT: seto %r9b
-; AVX1-NEXT: orb %r10b, %r9b
-; AVX1-NEXT: addq %rbp, %rbx
+; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: seto %cl
+; AVX1-NEXT: orb %r12b, %cl
+; AVX1-NEXT: addq %rsi, %rdi
; AVX1-NEXT: movq %r11, %rax
-; AVX1-NEXT: mulq %r15
-; AVX1-NEXT: addq %rbx, %rdx
+; AVX1-NEXT: mulq %r8
+; AVX1-NEXT: addq %rdi, %rdx
; AVX1-NEXT: setb %bl
-; AVX1-NEXT: orb %r9b, %bl
-; AVX1-NEXT: orb %r8b, %bl
-; AVX1-NEXT: movzbl %bl, %ebp
-; AVX1-NEXT: negl %ebp
-; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: negl %ecx
+; AVX1-NEXT: orb %cl, %bl
+; AVX1-NEXT: orb %r13b, %bl
+; AVX1-NEXT: movzbl %bl, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
-; AVX1-NEXT: movq %rax, 16(%r14)
-; AVX1-NEXT: movq %rdi, (%r14)
-; AVX1-NEXT: movq %rdx, 24(%r14)
-; AVX1-NEXT: movq %rsi, 8(%r14)
+; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX1-NEXT: movq %r10, 16(%r14)
+; AVX1-NEXT: movq %rax, (%r14)
+; AVX1-NEXT: movq %r15, 24(%r14)
+; AVX1-NEXT: movq %rdx, 8(%r14)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
@@ -2375,66 +2471,66 @@ define <2 x i32> @umulo_v2i128(<2 x i128
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq %r9, %r10
-; AVX2-NEXT: movq %rcx, %r12
-; AVX2-NEXT: movq %rdx, %r11
-; AVX2-NEXT: movq %rsi, %rax
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq %rdi, %r11
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: setne %cl
-; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: setne %dl
+; AVX2-NEXT: testq %rcx, %rcx
; AVX2-NEXT: setne %r13b
-; AVX2-NEXT: andb %cl, %r13b
-; AVX2-NEXT: mulq %r8
-; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: andb %dl, %r13b
+; AVX2-NEXT: mulq %r15
+; AVX2-NEXT: movq %rax, %rdi
; AVX2-NEXT: seto %bpl
; AVX2-NEXT: movq %r10, %rax
-; AVX2-NEXT: mulq %rdi
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: seto %bl
-; AVX2-NEXT: orb %bpl, %bl
-; AVX2-NEXT: addq %rsi, %rcx
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: mulq %r8
-; AVX2-NEXT: movq %rax, %rdi
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: addq %rcx, %rsi
-; AVX2-NEXT: setb %cl
-; AVX2-NEXT: orb %bl, %cl
-; AVX2-NEXT: orb %r13b, %cl
-; AVX2-NEXT: testq %r9, %r9
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: testq %r12, %r12
-; AVX2-NEXT: setne %r8b
-; AVX2-NEXT: andb %al, %r8b
+; AVX2-NEXT: mulq %r12
+; AVX2-NEXT: movq %rax, %rbx
+; AVX2-NEXT: seto %cl
+; AVX2-NEXT: orb %bpl, %cl
+; AVX2-NEXT: addq %rdi, %rbx
; AVX2-NEXT: movq %r12, %rax
; AVX2-NEXT: mulq %r15
-; AVX2-NEXT: movq %rax, %rbp
-; AVX2-NEXT: seto %r10b
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: addq %rbx, %r15
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: orb %cl, %al
+; AVX2-NEXT: orb %r13b, %al
+; AVX2-NEXT: movzbl %al, %ebp
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: setne %r13b
+; AVX2-NEXT: andb %al, %r13b
+; AVX2-NEXT: movq %rsi, %rax
+; AVX2-NEXT: mulq %r8
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: seto %r12b
; AVX2-NEXT: movq %r9, %rax
; AVX2-NEXT: mulq %r11
-; AVX2-NEXT: movq %rax, %rbx
-; AVX2-NEXT: seto %r9b
-; AVX2-NEXT: orb %r10b, %r9b
-; AVX2-NEXT: addq %rbp, %rbx
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: seto %cl
+; AVX2-NEXT: orb %r12b, %cl
+; AVX2-NEXT: addq %rsi, %rdi
; AVX2-NEXT: movq %r11, %rax
-; AVX2-NEXT: mulq %r15
-; AVX2-NEXT: addq %rbx, %rdx
+; AVX2-NEXT: mulq %r8
+; AVX2-NEXT: addq %rdi, %rdx
; AVX2-NEXT: setb %bl
-; AVX2-NEXT: orb %r9b, %bl
-; AVX2-NEXT: orb %r8b, %bl
-; AVX2-NEXT: movzbl %bl, %ebp
-; AVX2-NEXT: negl %ebp
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: negl %ecx
+; AVX2-NEXT: orb %cl, %bl
+; AVX2-NEXT: orb %r13b, %bl
+; AVX2-NEXT: movzbl %bl, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, 16(%r14)
-; AVX2-NEXT: movq %rdi, (%r14)
-; AVX2-NEXT: movq %rdx, 24(%r14)
-; AVX2-NEXT: movq %rsi, 8(%r14)
+; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX2-NEXT: movq %r10, 16(%r14)
+; AVX2-NEXT: movq %rax, (%r14)
+; AVX2-NEXT: movq %r15, 24(%r14)
+; AVX2-NEXT: movq %rdx, 8(%r14)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
@@ -2505,12 +2601,12 @@ define <2 x i32> @umulo_v2i128(<2 x i128
; AVX512-NEXT: andl $1, %esi
; AVX512-NEXT: kmovw %esi, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %r10, 16(%r14)
; AVX512-NEXT: movq %rax, (%r14)
; AVX512-NEXT: movq %r15, 24(%r14)
; AVX512-NEXT: movq %rdx, 8(%r14)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
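
The umulo_v2i128 hunks above all show one effect of the revert: the two i1 overflow flags are no longer materialized directly as 32-bit masks (movzbl/negl plus pinsrd) but as 64-bit-lane values that are moved to the sign bit and splatted back down (psllq $63 plus psrad $31, or vpcmpgtq against zero), because without widening the <2 x i32> result now legalizes through <2 x i64>. The function body sits outside the hunks shown, so the following reconstruction of the test is a sketch based on the visible fragments and on the usubo tests later in this mail; treat the exact attribute and store placement as assumptions.

define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
  ; multiply both lanes, returning {product, overflow bit} per lane
  %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %ovf = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  ; this sext of <2 x i1> is what becomes the mask-materialization code above
  %res = sext <2 x i1> %ovf to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}
declare {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128>, <2 x i128>)
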
Modified: llvm/trunk/test/CodeGen/X86/vec_usubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_usubo.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_usubo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_usubo.ll Tue Aug 6 16:00:43 2019
@@ -47,66 +47,91 @@ define <1 x i32> @usubo_v1i32(<1 x i32>
define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE2-LABEL: usubo_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: psubd %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: psubq %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: psubq %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: pcmpeqq %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movq %xmm2, (%rdi)
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
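
The SSE41/AVX sequence removed in this hunk tests for an unsigned-subtract borrow with pminud: after d = a - b, no borrow happened iff umin(a, d) == d, that is, iff d is unsigned-less-or-equal a, and the pcmpeqd result is then flipped with a pxor against all-ones. A hand-written IR equivalent of that predicate (a sketch for illustration, not part of the test file):

define <2 x i1> @usubo_v2i32_predicate(<2 x i32> %a, <2 x i32> %b) {
  %d = sub <2 x i32> %a, %b
  ; unsigned x - y wraps exactly when the difference exceeds the minuend
  %ovf = icmp ugt <2 x i32> %d, %a
  ret <2 x i1> %ovf
}
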
@@ -938,12 +963,12 @@ define <2 x i32> @usubo_v2i64(<2 x i64>
; SSE-NEXT: pxor %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -955,7 +980,6 @@ define <2 x i32> @usubo_v2i64(<2 x i64>
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
@@ -966,7 +990,6 @@ define <2 x i32> @usubo_v2i64(<2 x i64>
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
@@ -974,9 +997,9 @@ define <2 x i32> @usubo_v2i64(<2 x i64>
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpnleuq %xmm0, %xmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
@@ -1244,17 +1267,21 @@ define <2 x i32> @usubo_v2i128(<2 x i128
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: sbbl %eax, %eax
+; SSE2-NEXT: setb %al
+; SSE2-NEXT: movzbl %al, %r11d
; SSE2-NEXT: subq %r8, %rdi
; SSE2-NEXT: sbbq %r9, %rsi
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: sbbl %eax, %eax
+; SSE2-NEXT: setb %al
+; SSE2-NEXT: movzbl %al, %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pinsrw $4, %r11d, %xmm0
; SSE2-NEXT: movq %rdx, 16(%r10)
; SSE2-NEXT: movq %rdi, (%r10)
; SSE2-NEXT: movq %rcx, 24(%r10)
; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v2i128:
@@ -1262,17 +1289,21 @@ define <2 x i32> @usubo_v2i128(<2 x i128
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: sbbl %eax, %eax
+; SSSE3-NEXT: setb %al
+; SSSE3-NEXT: movzbl %al, %r11d
; SSSE3-NEXT: subq %r8, %rdi
; SSSE3-NEXT: sbbq %r9, %rsi
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: sbbl %eax, %eax
+; SSSE3-NEXT: setb %al
+; SSSE3-NEXT: movzbl %al, %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0
; SSSE3-NEXT: movq %rdx, 16(%r10)
; SSSE3-NEXT: movq %rdi, (%r10)
; SSSE3-NEXT: movq %rcx, 24(%r10)
; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: psllq $63, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v2i128:
@@ -1280,16 +1311,21 @@ define <2 x i32> @usubo_v2i128(<2 x i128
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: sbbl %r11d, %r11d
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: movzbl %al, %r11d
; SSE41-NEXT: subq %r8, %rdi
; SSE41-NEXT: sbbq %r9, %rsi
-; SSE41-NEXT: sbbl %eax, %eax
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: movzbl %al, %eax
; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrd $1, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $8, %r11d, %xmm0
; SSE41-NEXT: movq %rdx, 16(%r10)
; SSE41-NEXT: movq %rdi, (%r10)
; SSE41-NEXT: movq %rcx, 24(%r10)
; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: psllq $63, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v2i128:
@@ -1297,16 +1333,21 @@ define <2 x i32> @usubo_v2i128(<2 x i128
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX1-NEXT: sbbl %r11d, %r11d
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: movzbl %al, %r11d
; AVX1-NEXT: subq %r8, %rdi
; AVX1-NEXT: sbbq %r9, %rsi
-; AVX1-NEXT: sbbl %eax, %eax
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: movzbl %al, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
; AVX1-NEXT: movq %rdx, 16(%r10)
; AVX1-NEXT: movq %rdi, (%r10)
; AVX1-NEXT: movq %rcx, 24(%r10)
; AVX1-NEXT: movq %rsi, 8(%r10)
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v2i128:
@@ -1314,16 +1355,21 @@ define <2 x i32> @usubo_v2i128(<2 x i128
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: sbbl %r11d, %r11d
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: movzbl %al, %r11d
; AVX2-NEXT: subq %r8, %rdi
; AVX2-NEXT: sbbq %r9, %rsi
-; AVX2-NEXT: sbbl %eax, %eax
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
; AVX2-NEXT: movq %rdx, 16(%r10)
; AVX2-NEXT: movq %rdi, (%r10)
; AVX2-NEXT: movq %rcx, 24(%r10)
; AVX2-NEXT: movq %rsi, 8(%r10)
+; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v2i128:
@@ -1340,12 +1386,12 @@ define <2 x i32> @usubo_v2i128(<2 x i128
; AVX512-NEXT: andl $1, %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%r10)
; AVX512-NEXT: movq %rdi, (%r10)
; AVX512-NEXT: movq %rcx, 24(%r10)
; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
%val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
Modified: llvm/trunk/test/CodeGen/X86/vector-blend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-blend.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-blend.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-blend.ll Tue Aug 6 16:00:43 2019
@@ -64,30 +64,24 @@ entry:
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
; SSE2-LABEL: vsel_4xi8:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: andps %xmm2, %xmm0
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_4xi8:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_4xi8:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_4xi8:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
@@ -97,28 +91,26 @@ entry:
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
; SSE2-LABEL: vsel_4xi16:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: andps %xmm2, %xmm0
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_4xi16:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
-; SSSE3-NEXT: andps %xmm2, %xmm0
-; SSSE3-NEXT: andnps %xmm1, %xmm2
-; SSSE3-NEXT: orps %xmm2, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_4xi16:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_4xi16:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
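
Both selects in this file use constant i1 masks, so once <4 x i8> and <4 x i16> are promoted into 32-bit lanes the select collapses to one dword blend or shuffle instead of a byte/word blend with a mask constant. For vsel_4xi8 above, the constant-mask select is equivalent to the following shuffle, which is what the new blendps performs (a sketch; the test itself only contains the select form):

define <4 x i8> @vsel_4xi8_shuffle(<4 x i8> %v1, <4 x i8> %v2) {
  ; lanes 0, 1, 3 from %v1 and lane 2 from %v2
  %r = shufflevector <4 x i8> %v1, <4 x i8> %v2, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i8> %r
}
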
Modified: llvm/trunk/test/CodeGen/X86/vector-ext-logic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-ext-logic.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-ext-logic.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-ext-logic.ll Tue Aug 6 16:00:43 2019
@@ -140,17 +140,14 @@ define <8 x i32> @sext_xor_v8i32(<8 x i1
define <8 x i16> @zext_and_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: zext_and_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: zext_and_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vandps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
%xz = zext <8 x i8> %x to <8 x i16>
%yz = zext <8 x i8> %y to <8 x i16>
@@ -161,17 +158,14 @@ define <8 x i16> @zext_and_v8i16(<8 x i8
define <8 x i16> @zext_or_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: zext_or_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: zext_or_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
%xz = zext <8 x i8> %x to <8 x i16>
%yz = zext <8 x i8> %y to <8 x i16>
@@ -182,17 +176,14 @@ define <8 x i16> @zext_or_v8i16(<8 x i8>
define <8 x i16> @zext_xor_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: zext_xor_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm0
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: zext_xor_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
%xz = zext <8 x i8> %x to <8 x i16>
%yz = zext <8 x i8> %y to <8 x i16>
@@ -203,17 +194,19 @@ define <8 x i16> @zext_xor_v8i16(<8 x i8
define <8 x i16> @sext_and_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: sext_and_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: sext_and_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%xs = sext <8 x i8> %x to <8 x i16>
@@ -225,17 +218,19 @@ define <8 x i16> @sext_and_v8i16(<8 x i8
define <8 x i16> @sext_or_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: sext_or_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: sext_or_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%xs = sext <8 x i8> %x to <8 x i16>
@@ -247,17 +242,19 @@ define <8 x i16> @sext_or_v8i16(<8 x i8>
define <8 x i16> @sext_xor_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: sext_xor_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: sext_xor_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%xs = sext <8 x i8> %x to <8 x i16>
@@ -298,13 +295,18 @@ define <8 x i32> @bool_zext_and(<8 x i1>
define <8 x i32> @bool_zext_or(<8 x i1> %x, <8 x i1> %y) {
; SSE2-LABEL: bool_zext_or:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
@@ -323,13 +325,18 @@ define <8 x i32> @bool_zext_or(<8 x i1>
define <8 x i32> @bool_zext_xor(<8 x i1> %x, <8 x i1> %y) {
; SSE2-LABEL: bool_zext_xor:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
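
The common thread in this file: with promoted (rather than widened) <8 x i8>, a zext is just a mask of the high byte in each 16-bit lane, so it can be hoisted past the bitwise op and applied once, turning (zext x) op (zext y) into (x op y) masked with 0x00FF per lane, while a sext becomes an in-lane shl/ashr pair. The source pattern, reconstructed from the context lines visible above (the and and ret fall outside the hunk, so this completion is an assumption):

define <8 x i16> @zext_and_v8i16(<8 x i8> %x, <8 x i8> %y) {
  %xz = zext <8 x i8> %x to <8 x i16>
  %yz = zext <8 x i8> %y to <8 x i16>
  ; after promotion this folds to (x & y) masked with splat(255)
  %r = and <8 x i16> %xz, %yz
  ret <8 x i16> %r
}
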
Modified: llvm/trunk/test/CodeGen/X86/vector-gep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-gep.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-gep.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-gep.ll Tue Aug 6 16:00:43 2019
@@ -74,7 +74,8 @@ define <4 x i16*> @AGEP4(<4 x i16*> %par
define <4 x i8*> @AGEP5(<4 x i8*> %param, <4 x i8> %off) nounwind {
; CHECK-LABEL: AGEP5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd %xmm1, %xmm1
+; CHECK-NEXT: vpslld $24, %xmm1, %xmm1
+; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retl
%A = getelementptr i8, <4 x i8*> %param, <4 x i8> %off
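
The AGEP5 change is the same in-lane sign-extension idiom: after promotion the <4 x i8> offsets already sit in i32 lanes, so vpmovsxbd, which reads packed bytes, is replaced by a shift pair. In IR terms the pair computes (a sketch, not part of the test):

define <4 x i32> @sext_i8_in_i32_lanes(<4 x i32> %v) {
  ; move the byte to the top of the lane, then arithmetic-shift it back down
  %hi = shl <4 x i32> %v, <i32 24, i32 24, i32 24, i32 24>
  %r = ashr <4 x i32> %hi, <i32 24, i32 24, i32 24, i32 24>
  ret <4 x i32> %r
}
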
Modified: llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll Tue Aug 6 16:00:43 2019
@@ -24,6 +24,7 @@ define float @cvt_i16_to_f32(i16 %a0) no
define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; ALL-LABEL: cvt_4i16_to_4f32:
; ALL: # %bb.0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movq %rax, %rdx
@@ -931,20 +932,88 @@ define double @cvt_i16_to_f64(i16 %a0) n
}
define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
-; ALL-LABEL: cvt_2i16_to_2f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vmovd %xmm0, %eax
-; ALL-NEXT: movswl %ax, %ecx
-; ALL-NEXT: shrl $16, %eax
-; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm0
-; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: vmovd %ecx, %xmm1
-; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; ALL-NEXT: retq
+; AVX1-LABEL: cvt_2i16_to_2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: movswl %ax, %ecx
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: cvt_2i16_to_2f64:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX2-SLOW-NEXT: movswl %ax, %ecx
+; AVX2-SLOW-NEXT: shrl $16, %eax
+; AVX2-SLOW-NEXT: cwtl
+; AVX2-SLOW-NEXT: vmovd %eax, %xmm0
+; AVX2-SLOW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vmovd %ecx, %xmm1
+; AVX2-SLOW-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: cvt_2i16_to_2f64:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
+; AVX2-FAST-NEXT: movswl %ax, %ecx
+; AVX2-FAST-NEXT: shrl $16, %eax
+; AVX2-FAST-NEXT: cwtl
+; AVX2-FAST-NEXT: vmovd %eax, %xmm0
+; AVX2-FAST-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %ecx, %xmm1
+; AVX2-FAST-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-FAST-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512F-LABEL: cvt_2i16_to_2f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: movswl %ax, %ecx
+; AVX512F-NEXT: shrl $16, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: cvt_2i16_to_2f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX512VL-NEXT: movswl %ax, %ecx
+; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovd %ecx, %xmm1
+; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: retq
%1 = bitcast <2 x i16> %a0 to <2 x half>
%2 = fpext <2 x half> %1 to <2 x double>
ret <2 x double> %2
@@ -953,6 +1022,7 @@ define <2 x double> @cvt_2i16_to_2f64(<2
define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; ALL-LABEL: cvt_4i16_to_4f64:
; ALL: # %bb.0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movl %eax, %edx
@@ -1477,7 +1547,7 @@ define <4 x i16> @cvt_4f32_to_4i16(<4 x
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; ALL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -1502,7 +1572,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -1855,7 +1925,7 @@ define void @store_cvt_4f32_to_8i16_unde
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovaps %xmm0, (%rdi)
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
@@ -2162,12 +2232,12 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x
; ALL-NEXT: subq $40, %rsp
; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, (%rsp)
+; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps (%rsp), %xmm0
+; ALL-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; ALL-NEXT: addq $40, %rsp
; ALL-NEXT: retq
%1 = fptrunc <2 x double> %a0 to <2 x half>
@@ -2189,7 +2259,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, (%rsp)
+; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
@@ -2198,7 +2268,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps (%rsp), %xmm0
+; ALL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; ALL-NEXT: addq $88, %rsp
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
@@ -2220,7 +2290,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, (%rsp)
+; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
@@ -2229,7 +2299,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps (%rsp), %xmm0
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: addq $88, %rsp
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
@@ -2252,7 +2322,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, (%rsp)
+; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
@@ -2658,7 +2728,7 @@ define void @store_cvt_4f64_to_8i16_unde
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, (%rsp)
+; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
@@ -2667,7 +2737,7 @@ define void @store_cvt_4f64_to_8i16_unde
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps (%rsp), %xmm0
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovaps %xmm0, (%rbx)
; ALL-NEXT: addq $80, %rsp
; ALL-NEXT: popq %rbx
@@ -2695,7 +2765,7 @@ define void @store_cvt_4f64_to_8i16_zero
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, (%rsp)
+; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
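
The half-conversion churn is all lane bookkeeping: a promoted <4 x i16> keeps each half in its own 32-bit lane, so the tests gain a vpshufb or vpmovqw to pack the halves contiguously before vmovq/vcvtph2ps, and the __truncdfhf2 results are stored to staggered stack slots and reloaded with a zero-extending vpmovzxwd/vpmovzxwq rather than a plain vmovaps. The source follows the cvt_2i16_to_2f64 body shown in full above; the 4i16-to-4f32 variant is presumably the following (its body is outside the hunk, so this is a sketch):

define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}
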
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll Tue Aug 6 16:00:43 2019
@@ -8,40 +8,58 @@ define void @test_udiv7_v2i32(<2 x i32>*
; X64-LABEL: test_udiv7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: psubd %xmm2, %xmm0
-; X64-NEXT: psrld $1, %xmm0
-; X64-NEXT: paddd %xmm2, %xmm0
-; X64-NEXT: psrld $2, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm1, %eax
+; X64-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: subl %ecx, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: shrl $2, %eax
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: subl %edx, %ecx
+; X64-NEXT: shrl %ecx
+; X64-NEXT: addl %edx, %ecx
+; X64-NEXT: shrl $2, %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_udiv7_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: psubd %xmm2, %xmm0
-; X86-NEXT: psrld $1, %xmm0
-; X86-NEXT: paddd %xmm2, %xmm0
-; X86-NEXT: psrld $2, %xmm0
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %esi
+; X86-NEXT: movl $613566757, %ebx # imm = 0x24924925
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: shrl %esi
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: shrl $2, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shrl %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: shrl $2, %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_udiv7_v2i32:
@@ -92,50 +110,76 @@ define void @test_urem7_v2i32(<2 x i32>*
; X64-LABEL: test_urem7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psubd %xmm2, %xmm1
-; X64-NEXT: psrld $1, %xmm1
-; X64-NEXT: paddd %xmm2, %xmm1
-; X64-NEXT: psrld $2, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pslld $3, %xmm2
-; X64-NEXT: psubd %xmm2, %xmm1
-; X64-NEXT: paddd %xmm0, %xmm1
-; X64-NEXT: movq %xmm1, (%rsi)
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm1, %ecx
+; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: subl %edx, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: addl %edx, %eax
+; X64-NEXT: shrl $2, %eax
+; X64-NEXT: leal (,%rax,8), %edx
+; X64-NEXT: subl %edx, %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: movl %ecx, %edi
+; X64-NEXT: subl %edx, %edi
+; X64-NEXT: shrl %edi
+; X64-NEXT: addl %edx, %edi
+; X64-NEXT: shrl $2, %edi
+; X64-NEXT: leal (,%rdi,8), %edx
+; X64-NEXT: subl %edx, %edi
+; X64-NEXT: addl %ecx, %edi
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_urem7_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: psubd %xmm2, %xmm1
-; X86-NEXT: psrld $1, %xmm1
-; X86-NEXT: paddd %xmm2, %xmm1
-; X86-NEXT: psrld $2, %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pslld $3, %xmm2
-; X86-NEXT: psubd %xmm2, %xmm1
-; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: movq %xmm1, (%eax)
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %esi
+; X86-NEXT: movl $613566757, %edi # imm = 0x24924925
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: shrl $2, %ebx
+; X86-NEXT: leal (,%ebx,8), %eax
+; X86-NEXT: subl %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: leal (,%eax,8), %edx
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: movd %ebx, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%ebp)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_urem7_v2i32:
@@ -196,52 +240,67 @@ define void @test_sdiv7_v2i32(<2 x i32>*
; X64-LABEL: test_sdiv7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: pcmpgtd %xmm0, %xmm3
-; X64-NEXT: pand %xmm1, %xmm3
-; X64-NEXT: paddd %xmm0, %xmm3
-; X64-NEXT: psubd %xmm3, %xmm2
-; X64-NEXT: paddd %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm0
-; X64-NEXT: psrld $31, %xmm0
-; X64-NEXT: psrad $2, %xmm2
-; X64-NEXT: paddd %xmm0, %xmm2
-; X64-NEXT: movq %xmm2, (%rsi)
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm1, %eax
+; X64-NEXT: cltq
+; X64-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shrl $31, %ecx
+; X64-NEXT: sarl $2, %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: movslq %ecx, %rcx
+; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: addl %edx, %ecx
+; X64-NEXT: movl %ecx, %edx
+; X64-NEXT: shrl $31, %edx
+; X64-NEXT: sarl $2, %ecx
+; X64-NEXT: addl %edx, %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_sdiv7_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86-NEXT: pxor %xmm3, %xmm3
-; X86-NEXT: pcmpgtd %xmm0, %xmm3
-; X86-NEXT: pand %xmm1, %xmm3
-; X86-NEXT: paddd %xmm0, %xmm3
-; X86-NEXT: psubd %xmm3, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm0
-; X86-NEXT: psrld $31, %xmm0
-; X86-NEXT: psrad $2, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movq %xmm2, (%eax)
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %esi
+; X86-NEXT: movl $-1840700269, %ebp # imm = 0x92492493
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: imull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $2, %edi
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: imull %ebp
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $2, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: movd %edi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%ebx)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_sdiv7_v2i32:
@@ -304,60 +363,79 @@ define void @test_srem7_v2i32(<2 x i32>*
; X64-LABEL: test_srem7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: pcmpgtd %xmm0, %xmm3
-; X64-NEXT: pand %xmm1, %xmm3
-; X64-NEXT: paddd %xmm0, %xmm3
-; X64-NEXT: psubd %xmm3, %xmm2
-; X64-NEXT: paddd %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm1
-; X64-NEXT: psrld $31, %xmm1
-; X64-NEXT: psrad $2, %xmm2
-; X64-NEXT: paddd %xmm1, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm1
-; X64-NEXT: pslld $3, %xmm1
-; X64-NEXT: psubd %xmm1, %xmm2
-; X64-NEXT: paddd %xmm0, %xmm2
-; X64-NEXT: movq %xmm2, (%rsi)
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm1, %eax
+; X64-NEXT: movslq %eax, %rcx
+; X64-NEXT: imulq $-1840700269, %rcx, %rax # imm = 0x92492493
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movl %eax, %edx
+; X64-NEXT: shrl $31, %edx
+; X64-NEXT: sarl $2, %eax
+; X64-NEXT: addl %edx, %eax
+; X64-NEXT: leal (,%rax,8), %edx
+; X64-NEXT: subl %edx, %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: movslq %ecx, %rcx
+; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: addl %ecx, %edx
+; X64-NEXT: movl %edx, %edi
+; X64-NEXT: shrl $31, %edi
+; X64-NEXT: sarl $2, %edx
+; X64-NEXT: addl %edi, %edx
+; X64-NEXT: leal (,%rdx,8), %edi
+; X64-NEXT: subl %edi, %edx
+; X64-NEXT: addl %ecx, %edx
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_srem7_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86-NEXT: pxor %xmm3, %xmm3
-; X86-NEXT: pcmpgtd %xmm0, %xmm3
-; X86-NEXT: pand %xmm1, %xmm3
-; X86-NEXT: paddd %xmm0, %xmm3
-; X86-NEXT: psubd %xmm3, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm1
-; X86-NEXT: psrld $31, %xmm1
-; X86-NEXT: psrad $2, %xmm2
-; X86-NEXT: paddd %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm1
-; X86-NEXT: pslld $3, %xmm1
-; X86-NEXT: psubd %xmm1, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movq %xmm2, (%eax)
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %esi
+; X86-NEXT: movl $-1840700269, %ebx # imm = 0x92492493
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: imull %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $2, %edi
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: leal (,%edi,8), %eax
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: imull %ebx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $2, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: leal (,%edx,8), %eax
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: movd %edi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%ebp)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_srem7_v2i32:
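
The remainder variant above reuses the same quotient computation and then forms n - 7*q without a second multiply: leal (,%reg,8) computes 8*q, the subl yields q - 8*q = -7*q, and the final addl adds n back in, giving n - 7*q. Worked check for n = 100: q = 14, r = 100 - 98 = 2. A matching IR sketch (hypothetical name):

  define void @srem7_sketch(<2 x i32>* %x, <2 x i32>* %y) nounwind {
    %a = load <2 x i32>, <2 x i32>* %x
    %r = srem <2 x i32> %a, <i32 7, i32 7>    ; lowered as n - 7*(n sdiv 7)
    store <2 x i32> %r, <2 x i32>* %y
    ret void
  }
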
@@ -428,7 +506,10 @@ define void @test_udiv_pow2_v2i32(<2 x i
; X64-LABEL: test_udiv_pow2_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: psrld $3, %xmm0
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: psrlq $3, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
@@ -437,7 +518,10 @@ define void @test_udiv_pow2_v2i32(<2 x i
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: psrld $3, %xmm0
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: psrlq $3, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
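
With the widening flag reverted, <2 x i32> is type-promoted to <2 x i64>, so the single psrld $3 above becomes a zero-extend (punpckldq against a zeroed register), a 64-bit psrlq $3, and a pshufd to repack the low halves. Both forms compute an unsigned divide by 8 as a logical right shift. Sketch (hypothetical name; the divisor of 8 is inferred from the shift amount):

  define void @udiv8_sketch(<2 x i32>* %x, <2 x i32>* %y) nounwind {
    %a = load <2 x i32>, <2 x i32>* %x
    %d = udiv <2 x i32> %a, <i32 8, i32 8>    ; power-of-two divisor -> lshr by 3
    store <2 x i32> %d, <2 x i32>* %y
    ret void
  }
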
@@ -465,9 +549,14 @@ define void @test_udiv_pow2_v2i32(<2 x i
define void @test_urem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
; X64-LABEL: test_urem_pow2_v2i32:
; X64: # %bb.0:
-; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: andps {{.*}}(%rip), %xmm0
-; X64-NEXT: movlps %xmm0, (%rsi)
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movl 4(%rdi), %ecx
+; X64-NEXT: movq %rcx, %xmm0
+; X64-NEXT: movq %rax, %xmm1
+; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: pand {{.*}}(%rip), %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_urem_pow2_v2i32:
@@ -475,8 +564,10 @@ define void @test_urem_pow2_v2i32(<2 x i
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-NEXT: andps {{\.LCPI.*}}, %xmm0
-; X86-NEXT: movlps %xmm0, (%eax)
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_urem_pow2_v2i32:
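
An unsigned remainder by a power of two is just a mask (n urem 8 == n & 7), hence the pand/andps against a constant-pool vector in both forms above; the surrounding shuffles are promotion artifacts. Sketch (hypothetical name; the exact divisor lives in the test's IR, 8 is assumed here to match the neighbouring tests):

  define void @urem8_sketch(<2 x i32>* %x, <2 x i32>* %y) nounwind {
    %a = load <2 x i32>, <2 x i32>* %x
    %r = urem <2 x i32> %a, <i32 8, i32 8>    ; folds to an 'and' with <i32 7, i32 7>
    store <2 x i32> %r, <2 x i32>* %y
    ret void
  }
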
@@ -504,12 +595,23 @@ define void @test_sdiv_pow2_v2i32(<2 x i
; X64-LABEL: test_sdiv_pow2_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,1]
; X64-NEXT: psrad $31, %xmm1
-; X64-NEXT: psrld $29, %xmm1
-; X64-NEXT: paddd %xmm0, %xmm1
-; X64-NEXT: psrad $3, %xmm1
-; X64-NEXT: movq %xmm1, (%rsi)
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: psrlq $31, %xmm0
+; X64-NEXT: pand {{.*}}(%rip), %xmm0
+; X64-NEXT: psrlq $29, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; X64-NEXT: psrad $31, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: psrlq $3, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_sdiv_pow2_v2i32:
@@ -517,12 +619,28 @@ define void @test_sdiv_pow2_v2i32(<2 x i
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: psrad $31, %xmm1
-; X86-NEXT: psrld $29, %xmm1
-; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: psrad $3, %xmm1
-; X86-NEXT: movq %xmm1, (%eax)
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-NEXT: psrlq $31, %xmm2
+; X86-NEXT: movsd {{.*#+}} xmm2 = xmm2[0,1]
+; X86-NEXT: movapd {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; X86-NEXT: xorpd %xmm1, %xmm2
+; X86-NEXT: psubq %xmm1, %xmm2
+; X86-NEXT: pand {{\.LCPI.*}}, %xmm2
+; X86-NEXT: psrlq $29, %xmm2
+; X86-NEXT: paddq %xmm0, %xmm2
+; X86-NEXT: psllq $32, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; X86-NEXT: psrad $31, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: psrlq $3, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_sdiv_pow2_v2i32:
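
Signed division by a power of two needs a bias before the arithmetic shift so that it truncates toward zero rather than toward negative infinity: the removed psrad $31 / psrld $29 / paddd sequence computes n + ((n >> 31) >>> 29), i.e. adds 7 only when n is negative, before the final psrad $3. Worked check for n = -9: -9 + 7 = -2, and -2 >> 3 = -1 = trunc(-9/8); the unbiased shift alone would give -2. The promoted replacement performs the same steps on sign-extended 64-bit lanes. Sketch (hypothetical name, divisor of 8 inferred from the shift amounts):

  define void @sdiv8_sketch(<2 x i32>* %x, <2 x i32>* %y) nounwind {
    %a = load <2 x i32>, <2 x i32>* %x
    %d = sdiv <2 x i32> %a, <i32 8, i32 8>    ; bias-then-ashr lowering
    store <2 x i32> %d, <2 x i32>* %y
    ret void
  }
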
@@ -558,7 +676,10 @@ define void @test_srem_pow2_v2i32(<2 x i
; X64-LABEL: test_srem_pow2_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: psrld $3, %xmm0
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: psrlq $3, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
@@ -567,7 +688,10 @@ define void @test_srem_pow2_v2i32(<2 x i
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: psrld $3, %xmm0
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: psrlq $3, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
@@ -598,45 +722,52 @@ define void @test_udiv_v2i32(<2 x i32>*
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: movd %xmm1, %esi
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT: movd %xmm2, %esi
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %esi
-; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT: movl %eax, %esi
; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-NEXT: movd %xmm0, %esi
+; X64-NEXT: movd %xmm1, %edi
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %esi
+; X64-NEXT: divl %edi
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT: movq %xmm2, (%rcx)
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_udiv_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: movd %xmm1, %ebx
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
-; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
-; X86-NEXT: movd %xmm1, %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divl %esi
+; X86-NEXT: divl %ebx
; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X86-NEXT: movq %xmm2, (%ecx)
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_udiv_v2i32:
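
x86 has no vector integer divide, so a udiv with a non-constant divisor is scalarized: each lane is moved to a GPR with movd (the high lane via a pshufd or shufps first), %edx is zeroed because divl divides the pair %edx:%eax, and the two quotients from %eax are reassembled with movd plus punpckldq. Sketch (hypothetical name; the three-pointer signature is assumed from the truncated define in the hunk header):

  define void @udiv_sketch(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwind {
    %a = load <2 x i32>, <2 x i32>* %x
    %b = load <2 x i32>, <2 x i32>* %y
    %d = udiv <2 x i32> %a, %b                ; one divl per lane
    store <2 x i32> %d, <2 x i32>* %z
    ret void
  }
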
@@ -697,45 +828,52 @@ define void @test_urem_v2i32(<2 x i32>*
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: movd %xmm1, %esi
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT: movd %xmm2, %esi
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %esi
-; X64-NEXT: movd %edx, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT: movl %edx, %esi
; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-NEXT: movd %xmm0, %esi
+; X64-NEXT: movd %xmm1, %edi
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %esi
+; X64-NEXT: divl %edi
; X64-NEXT: movd %edx, %xmm0
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT: movq %xmm2, (%rcx)
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_urem_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: movd %xmm1, %ebx
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
-; X86-NEXT: movd %edx, %xmm2
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
-; X86-NEXT: movd %xmm1, %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divl %esi
+; X86-NEXT: divl %ebx
; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X86-NEXT: movq %xmm2, (%ecx)
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_urem_v2i32:
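
The urem variant above is identical except that it keeps the remainder divl leaves in %edx rather than the quotient in %eax (note the movd %edx stores). Sketch, under the same assumed signature:

  define void @urem_sketch(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwind {
    %a = load <2 x i32>, <2 x i32>* %x
    %b = load <2 x i32>, <2 x i32>* %y
    %r = urem <2 x i32> %a, %b                ; remainder comes back in %edx
    store <2 x i32> %r, <2 x i32>* %z
    ret void
  }
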
@@ -796,20 +934,21 @@ define void @test_sdiv_v2i32(<2 x i32>*
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: movd %xmm1, %esi
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT: movd %xmm2, %esi
; X64-NEXT: cltd
; X64-NEXT: idivl %esi
-; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT: movl %eax, %esi
; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-NEXT: movd %xmm0, %esi
+; X64-NEXT: movd %xmm1, %edi
; X64-NEXT: cltd
-; X64-NEXT: idivl %esi
+; X64-NEXT: idivl %edi
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT: movq %xmm2, (%rcx)
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_sdiv_v2i32:
@@ -817,26 +956,27 @@ define void @test_sdiv_v2i32(<2 x i32>*
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: movd %xmm1, %edi
-; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; X86-NEXT: movd %xmm1, %ebx
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: cltd
-; X86-NEXT: idivl %ebx
-; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: idivl %esi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cltd
-; X86-NEXT: idivl %edi
-; X86-NEXT: movd %eax, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT: movq %xmm1, (%esi)
+; X86-NEXT: idivl %ebx
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
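
For the signed version, each divide is preceded by cltd, which sign-extends %eax into %edx:%eax as idivl requires; the unsigned tests zero %edx with xorl instead. Sketch, same assumed shape:

  define void @sdiv_sketch(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwind {
    %a = load <2 x i32>, <2 x i32>* %x
    %b = load <2 x i32>, <2 x i32>* %y
    %d = sdiv <2 x i32> %a, %b                ; cltd + idivl per lane
    store <2 x i32> %d, <2 x i32>* %z
    ret void
  }
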
@@ -905,20 +1045,21 @@ define void @test_srem_v2i32(<2 x i32>*
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: movd %xmm1, %esi
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT: movd %xmm2, %esi
; X64-NEXT: cltd
; X64-NEXT: idivl %esi
-; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT: movl %eax, %esi
; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-NEXT: movd %xmm0, %esi
+; X64-NEXT: movd %xmm1, %edi
; X64-NEXT: cltd
-; X64-NEXT: idivl %esi
+; X64-NEXT: idivl %edi
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT: movq %xmm2, (%rcx)
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_srem_v2i32:
@@ -926,26 +1067,27 @@ define void @test_srem_v2i32(<2 x i32>*
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: movd %xmm1, %edi
-; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; X86-NEXT: movd %xmm1, %ebx
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: cltd
-; X86-NEXT: idivl %ebx
-; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: idivl %esi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cltd
-; X86-NEXT: idivl %edi
-; X86-NEXT: movd %eax, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT: movq %xmm1, (%esi)
+; X86-NEXT: idivl %ebx
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
Modified: llvm/trunk/test/CodeGen/X86/vector-narrow-binop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-narrow-binop.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-narrow-binop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-narrow-binop.ll Tue Aug 6 16:00:43 2019
@@ -107,20 +107,34 @@ define <2 x i8> @PR39893(<2 x i32> %x, <
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: psubd %xmm0, %xmm2
-; SSE-NEXT: psrld $16, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: PR39893:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: retq
+; AVX1-LABEL: PR39893:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR39893:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: PR39893:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: retq
%sub = sub <2 x i32> <i32 0, i32 undef>, %x
%bc = bitcast <2 x i32> %sub to <8 x i8>
%shuffle = shufflevector <8 x i8> %y, <8 x i8> %bc, <2 x i32> <i32 10, i32 4>
@@ -132,13 +146,16 @@ define <2 x i8> @PR39893_2(<2 x float> %
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: subps %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: PR39893_2:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%fsub = fsub <2 x float> zeroinitializer, %x
%bc = bitcast <2 x float> %fsub to <8 x i8>
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll Tue Aug 6 16:00:43 2019
@@ -198,35 +198,22 @@ define i64 @test_v16i64(<16 x i64> %a0)
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
-; AVX1-SLOW-LABEL: test_v2i32:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v2i32:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0)
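
With the flag reverted, the <2 x i32> reduction input is promoted to <2 x i64>, so the horizontal add pairs the two 64-bit lanes (pshufd [2,3,0,1] plus paddq) and reads the low 32 bits with movd; the low 32 bits of a 64-bit add equal the 32-bit sum, so the result is unchanged. The widened form instead used pshufd [1,1,2,3] plus paddd on 32-bit lanes. Calling shape, copied from the test (the wrapper name is hypothetical):

  declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)

  define i32 @reduce_add_sketch(<2 x i32> %a0) {
    %r = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0)
    ret i32 %r
  }
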
@@ -510,40 +497,24 @@ define i32 @test_v32i32(<32 x i32> %a0)
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
-; AVX1-SLOW-LABEL: test_v2i16:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v2i16:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
@@ -554,50 +525,49 @@ define i16 @test_v2i16(<2 x i16> %a0) {
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4i16:
; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4i16:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4i16:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
@@ -944,34 +914,32 @@ define i16 @test_v64i16(<64 x i16> %a0)
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -982,44 +950,59 @@ define i8 @test_v2i8(<2 x i8> %a0) {
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v4i8:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-SLOW-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v4i8:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-FAST-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -1030,52 +1013,73 @@ define i8 @test_v4i8(<4 x i8> %a0) {
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: paddw %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v8i8:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-SLOW-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v8i8:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-FAST-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-and-bool.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-and-bool.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-and-bool.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-and-bool.ll Tue Aug 6 16:00:43 2019
@@ -118,29 +118,17 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) {
}
define i1 @trunc_v8i16_v8i1(<8 x i8>) {
-; SSE2-LABEL: trunc_v8i16_v8i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: cmpb $-1, %al
-; SSE2-NEXT: sete %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: trunc_v8i16_v8i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: psllw $15, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: cmpb $-1, %al
-; SSE41-NEXT: sete %al
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_v8i16_v8i1:
+; SSE: # %bb.0:
+; SSE-NEXT: psllw $15, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: cmpb $-1, %al
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
;
; AVX-LABEL: trunc_v8i16_v8i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
@@ -150,9 +138,9 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
;
; AVX512F-LABEL: trunc_v8i16_v8i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: cmpb $-1, %al
; AVX512F-NEXT: sete %al
@@ -161,8 +149,8 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
;
; AVX512BW-LABEL: trunc_v8i16_v8i1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: cmpb $-1, %al
; AVX512BW-NEXT: sete %al
@@ -171,8 +159,8 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
;
; AVX512VL-LABEL: trunc_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovb2m %xmm0, %k0
+; AVX512VL-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovw2m %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: cmpb $-1, %al
; AVX512VL-NEXT: sete %al
@@ -1070,33 +1058,22 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) {
}
define i1 @icmp_v8i16_v8i1(<8 x i8>) {
-; SSE2-LABEL: icmp_v8i16_v8i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: packsswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: cmpb $-1, %al
-; SSE2-NEXT: sete %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: icmp_v8i16_v8i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: cmpb $-1, %al
-; SSE41-NEXT: sete %al
-; SSE41-NEXT: retq
+; SSE-LABEL: icmp_v8i16_v8i1:
+; SSE: # %bb.0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpeqw %xmm0, %xmm1
+; SSE-NEXT: packsswb %xmm0, %xmm1
+; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: cmpb $-1, %al
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
;
; AVX-LABEL: icmp_v8i16_v8i1:
; AVX: # %bb.0:
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: cmpb $-1, %al
@@ -1105,10 +1082,11 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
;
; AVX512F-LABEL: icmp_v8i16_v8i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: cmpb $-1, %al
; AVX512F-NEXT: sete %al
@@ -1118,7 +1096,8 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
; AVX512BW-LABEL: icmp_v8i16_v8i1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: cmpb $-1, %al
; AVX512BW-NEXT: sete %al
@@ -1127,7 +1106,7 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
;
; AVX512VL-LABEL: icmp_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
+; AVX512VL-NEXT: vptestnmw {{.*}}(%rip), %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: cmpb $-1, %al
; AVX512VL-NEXT: sete %al
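
The idiom running through this file for "are all lanes true" is: shift each element's low bit into its sign bit (psllw $15), narrow with signed saturation (packsswb), gather the byte sign bits with pmovmskb, and compare the mask against -1; the AVX512 forms test the bits straight into a mask register instead. A sketch of the IR shape these tests exercise, assuming the reduce-intrinsic naming used elsewhere in this file and a hypothetical wrapper name:

  declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>)

  define i1 @all_true_sketch(<8 x i8> %v) {
    %b = trunc <8 x i8> %v to <8 x i1>        ; keep only bit 0 of each lane
    %r = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %b)
    ret i1 %r
  }
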
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-and.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-and.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-and.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-and.ll Tue Aug 6 16:00:43 2019
@@ -182,14 +182,14 @@ define i64 @test_v16i64(<16 x i64> %a0)
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
@@ -397,8 +397,7 @@ define i32 @test_v32i32(<32 x i32> %a0)
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -406,7 +405,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -418,10 +417,9 @@ define i16 @test_v2i16(<2 x i16> %a0) {
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -429,9 +427,9 @@ define i16 @test_v4i16(<4 x i16> %a0) {
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -686,8 +684,7 @@ define i16 @test_v64i16(<64 x i16> %a0)
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -695,8 +692,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -704,7 +700,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -716,11 +712,9 @@ define i8 @test_v2i8(<2 x i8> %a0) {
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -728,11 +722,9 @@ define i8 @test_v4i8(<4 x i8> %a0) {
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -740,9 +732,9 @@ define i8 @test_v4i8(<4 x i8> %a0) {
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -754,13 +746,12 @@ define i8 @test_v4i8(<4 x i8> %a0) {
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -768,13 +759,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -782,12 +772,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll Tue Aug 6 16:00:43 2019
@@ -776,31 +776,24 @@ define i64 @test_v16i64(<16 x i64> %a0)
;
define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE2-LABEL: test_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v2i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pmuludq %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0)
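
As with the add reduction, the promoted <2 x i64> lanes are paired with pshufd [2,3,0,1] and multiplied with pmuludq, a 32x32->64 multiply whose low 32 bits match the i32 product, replacing the widened pmulld/pmullw forms. Calling shape, copied from the test (hypothetical wrapper name):

  declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>)

  define i32 @reduce_mul_sketch(<2 x i32> %a0) {
    %r = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0)
    ret i32 %r
  }
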
@@ -1101,25 +1094,24 @@ define i32 @test_v32i32(<32 x i32> %a0)
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pmullw %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
@@ -1128,33 +1120,44 @@ define i16 @test_v2i16(<2 x i16> %a0) {
}
define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v4i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pextrw $0, %xmm1, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmulld %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: movd %xmm0, %eax
+; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
@@ -1531,34 +1534,32 @@ define i16 @test_v64i16(<64 x i16> %a0)
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmullw %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmuludq %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -1569,53 +1570,42 @@ define i8 @test_v2i8(<2 x i8> %a0) {
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
-; SSE41-NEXT: pmullw %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmulld %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -1626,16 +1616,12 @@ define i8 @test_v4i8(<4 x i8> %a0) {
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,2,3,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -1643,17 +1629,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmullw %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[10],zero,xmm1[14],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
+; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -1661,59 +1642,27 @@ define i8 @test_v8i8(<8 x i8> %a0) {
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
-; AVX512BW-LABEL: test_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3]
-; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: test_v8i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0)
ret i8 %1
}
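
A note for anyone reading the checks above without the full test file: each
of these reduction tests is a single call to the reduction intrinsic, as the
tail of the last hunk shows. A minimal standalone version (assuming only the
standard declaration for the intrinsic already named in the hunk) is:

  declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>)

  define i8 @test_v8i8(<8 x i8> %a0) {
    ; multiply all eight i8 lanes together and return the scalar product
    %1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0)
    ret i8 %1
  }

The restored '+' lowerings legalize the i8 elements by promoting them into
wider lanes, which is why the byte unpack/pack sequences are replaced by
pmuludq/pmulld/pmullw ladders over pshufd'd halves.
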
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-or-bool.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-or-bool.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-or-bool.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-or-bool.ll Tue Aug 6 16:00:43 2019
@@ -112,29 +112,17 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) {
}
define i1 @trunc_v8i16_v8i1(<8 x i8>) {
-; SSE2-LABEL: trunc_v8i16_v8i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: setne %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: trunc_v8i16_v8i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: psllw $15, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
-; SSE41-NEXT: setne %al
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_v8i16_v8i1:
+; SSE: # %bb.0:
+; SSE-NEXT: psllw $15, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: testb %al, %al
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
;
; AVX-LABEL: trunc_v8i16_v8i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
@@ -144,9 +132,9 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
;
; AVX512F-LABEL: trunc_v8i16_v8i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: setne %al
@@ -155,8 +143,8 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
;
; AVX512BW-LABEL: trunc_v8i16_v8i1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: setne %al
@@ -165,8 +153,8 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
;
; AVX512VL-LABEL: trunc_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovb2m %xmm0, %k0
+; AVX512VL-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovw2m %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: testb %al, %al
; AVX512VL-NEXT: setne %al
@@ -1055,33 +1043,22 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) {
}
define i1 @icmp_v8i16_v8i1(<8 x i8>) {
-; SSE2-LABEL: icmp_v8i16_v8i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: packsswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: setne %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: icmp_v8i16_v8i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
-; SSE41-NEXT: setne %al
-; SSE41-NEXT: retq
+; SSE-LABEL: icmp_v8i16_v8i1:
+; SSE: # %bb.0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpeqw %xmm0, %xmm1
+; SSE-NEXT: packsswb %xmm0, %xmm1
+; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: testb %al, %al
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
;
; AVX-LABEL: icmp_v8i16_v8i1:
; AVX: # %bb.0:
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: testb %al, %al
@@ -1090,10 +1067,11 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
;
; AVX512F-LABEL: icmp_v8i16_v8i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: setne %al
@@ -1103,7 +1081,8 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
; AVX512BW-LABEL: icmp_v8i16_v8i1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: setne %al
@@ -1112,7 +1091,7 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
;
; AVX512VL-LABEL: icmp_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
+; AVX512VL-NEXT: vptestnmw {{.*}}(%rip), %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: testb %al, %al
; AVX512VL-NEXT: setne %al
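
The IR bodies of the bool-reduction tests are not visible in these hunks;
from the function names and checks they are presumably a trunc (or an icmp
against zero) to <8 x i1> followed by an i1 OR-reduction. A hypothetical
sketch, with the intrinsic name inferred by analogy with the mul intrinsic
above:

  declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1>)

  ; hypothetical reconstruction, not copied from the test file
  define i1 @trunc_v8i16_v8i1(<8 x i8> %a) {
    ; keep bit 0 of each lane, then OR the eight i1 values together
    %t = trunc <8 x i8> %a to <8 x i1>
    %r = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %t)
    ret i1 %r
  }

This also explains the new vpand against the 255,0,255,0,... mask in
icmp_v8i16_v8i1: with promotion the i8 payload sits in the low byte of each
i16 lane, so the compare first masks off the undefined high bytes.
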
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-or.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-or.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-or.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-or.ll Tue Aug 6 16:00:43 2019
@@ -182,14 +182,14 @@ define i64 @test_v16i64(<16 x i64> %a0)
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
@@ -397,8 +397,7 @@ define i32 @test_v32i32(<32 x i32> %a0)
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -406,7 +405,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -418,10 +417,9 @@ define i16 @test_v2i16(<2 x i16> %a0) {
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -429,9 +427,9 @@ define i16 @test_v4i16(<4 x i16> %a0) {
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -686,8 +684,7 @@ define i16 @test_v64i16(<64 x i16> %a0)
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -695,8 +692,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -704,7 +700,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -716,11 +712,9 @@ define i8 @test_v2i8(<2 x i8> %a0) {
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -728,11 +722,9 @@ define i8 @test_v4i8(<4 x i8> %a0) {
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -740,9 +732,9 @@ define i8 @test_v4i8(<4 x i8> %a0) {
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -754,13 +746,12 @@ define i8 @test_v4i8(<4 x i8> %a0) {
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -768,13 +759,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -782,12 +772,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
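
These OR-reduction hunks are mechanical: each pshufd/por pair folds the high
half of the remaining elements onto the low half. One halving step of the
ladder looks roughly like this in IR (a generic sketch, not the test file's
own code):

  ; fold elements 2,3 of a <4 x i32> onto elements 0,1, then
  ; recurse on the low half until one element remains
  %hi = shufflevector <4 x i32> %v, <4 x i32> undef,
                      <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %v2 = or <4 x i32> %v, %hi

The switch from psrld/psrlw bit shifts to pshufd lane shuffles in the '+'
lines follows from the revert: under promotion each narrow element occupies
a full dword lane, so the upper element is reached by a lane shuffle rather
than by shifting bits within a lane.
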
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-smax.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-smax.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-smax.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-smax.ll Tue Aug 6 16:00:43 2019
@@ -670,35 +670,108 @@ define i64 @test_v16i64(<16 x i64> %a0)
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movd %xmm3, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: retq
;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX2-LABEL: test_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v2i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsraq $32, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vpsraq $32, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX512VL-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a0)
ret i32 %1
}
@@ -1041,62 +1114,199 @@ define i32 @test_v32i32(<32 x i32> %a0)
;
define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pmaxsw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v2i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: psllq $48, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: psllq $48, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: retq
;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
+; SSE41-LABEL: test_v2i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: psllq $48, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: psrad $16, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT: psllq $48, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: psrad $16, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT: retq
;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v2i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $48, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsraq $48, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $48, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm1
+; AVX512VL-NEXT: vpsraq $48, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT: retq
%1 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> %a0)
ret i16 %1
}
define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pmaxsw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pmaxsw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v4i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pslld $16, %xmm0
+; SSE41-NEXT: psrad $16, %xmm0
+; SSE41-NEXT: pslld $16, %xmm1
+; SSE41-NEXT: psrad $16, %xmm1
+; SSE41-NEXT: pmaxsd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: pmaxsd %xmm1, %xmm0
+; SSE41-NEXT: movd %xmm0, %eax
+; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
+; AVX-NEXT: vpslld $16, %xmm0, %xmm1
+; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
@@ -1384,41 +1594,132 @@ define i16 @test_v64i16(<64 x i16> %a0)
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: psllq $56, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: psllq $56, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movd %xmm3, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: psllq $56, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: psrad $24, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT: psllq $56, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: psrad $24, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $56, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
+; AVX2-LABEL: test_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $56, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsraq $56, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $56, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm1
+; AVX512VL-NEXT: vpsraq $56, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> %a0)
ret i8 %1
}
@@ -1426,17 +1727,19 @@ define i8 @test_v2i8(<2 x i8> %a0) {
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pslld $24, %xmm0
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: pslld $24, %xmm1
+; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
@@ -1446,32 +1749,42 @@ define i8 @test_v4i8(<4 x i8> %a0) {
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmaxsb %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pslld $24, %xmm0
+; SSE41-NEXT: psrad $24, %xmm0
+; SSE41-NEXT: pslld $24, %xmm1
+; SSE41-NEXT: psrad $24, %xmm1
+; SSE41-NEXT: pmaxsd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: pmaxsd %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpslld $24, %xmm0, %xmm1
+; AVX-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpslld $24, %xmm0, %xmm1
+; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -1482,64 +1795,82 @@ define i8 @test_v4i8(<4 x i8> %a0) {
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmaxsb %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: psraw $8, %xmm0
+; SSE41-NEXT: psllw $8, %xmm1
+; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: pmaxsw %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: psraw $8, %xmm0
+; SSE41-NEXT: pmaxsw %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psllw $8, %xmm1
+; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: pmaxsw %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
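
The recurring shift pairs in the '+' lines (pslld $24 / psrad $24, psllw $8
/ psraw $8, vpsllq $56 / vpsraq $56, and so on) are in-register sign
extension: under promotion an i8 or i16 element lives in the low bits of a
wider lane with undefined bits above it, so the lowering has to materialize
the sign bits before a signed max or min. The IR equivalent of pslld $24 /
psrad $24 is simply:

  ; sign-extend the low 8 bits of each i32 lane in place
  %shl  = shl  <4 x i32> %v, <i32 24, i32 24, i32 24, i32 24>
  %sext = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>

On AVX512 the same trick runs per i64 lane with vpsllq/vpsraq, after which a
plain vpmaxsq (or vpminsq in the smin file below) finishes the reduction.
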
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-smin.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-smin.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-smin.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-smin.ll Tue Aug 6 16:00:43 2019
@@ -668,35 +668,108 @@ define i64 @test_v16i64(<16 x i64> %a0)
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: psllq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movd %xmm3, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminsd %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $32, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: retq
;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX2-LABEL: test_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v2i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsraq $32, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vpsraq $32, %zmm0, %zmm0
+; AVX512BW-NEXT: vpminsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm1
+; AVX512VL-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a0)
ret i32 %1
}
@@ -1039,62 +1112,199 @@ define i32 @test_v32i32(<32 x i32> %a0)
;
define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pminsw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v2i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: psllq $48, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: psllq $48, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: retq
;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
+; SSE41-LABEL: test_v2i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: psllq $48, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: psrad $16, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT: psllq $48, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: psrad $16, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT: retq
;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v2i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $48, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsraq $48, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $48, %zmm0, %zmm0
+; AVX512BW-NEXT: vpminsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm1
+; AVX512VL-NEXT: vpsraq $48, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
+; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT: retq
%1 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> %a0)
ret i16 %1
}
define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pminsw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pminsw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v4i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pslld $16, %xmm0
+; SSE41-NEXT: psrad $16, %xmm0
+; SSE41-NEXT: pslld $16, %xmm1
+; SSE41-NEXT: psrad $16, %xmm1
+; SSE41-NEXT: pminsd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: pminsd %xmm1, %xmm0
+; SSE41-NEXT: movd %xmm0, %eax
+; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
+; AVX-NEXT: vpslld $16, %xmm0, %xmm1
+; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX-NEXT: vpminsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX512-NEXT: vpminsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
@@ -1382,41 +1592,132 @@ define i16 @test_v64i16(<64 x i16> %a0)
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: psllq $56, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: psllq $56, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movd %xmm3, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminsb %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: psllq $56, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: psrad $24, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT: psllq $56, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: psrad $24, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllq $56, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
+; AVX2-LABEL: test_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $56, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsraq $56, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $56, %zmm0, %zmm0
+; AVX512BW-NEXT: vpminsq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm1
+; AVX512VL-NEXT: vpsraq $56, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
+; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> %a0)
ret i8 %1
}
@@ -1424,17 +1725,19 @@ define i8 @test_v2i8(<2 x i8> %a0) {
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pslld $24, %xmm0
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: pslld $24, %xmm1
+; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
@@ -1444,32 +1747,42 @@ define i8 @test_v4i8(<4 x i8> %a0) {
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pminsb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminsb %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pslld $24, %xmm0
+; SSE41-NEXT: psrad $24, %xmm0
+; SSE41-NEXT: pslld $24, %xmm1
+; SSE41-NEXT: psrad $24, %xmm1
+; SSE41-NEXT: pminsd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: pminsd %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpslld $24, %xmm0, %xmm1
+; AVX-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX-NEXT: vpminsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpslld $24, %xmm0, %xmm1
+; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT: vpminsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -1480,64 +1793,82 @@ define i8 @test_v4i8(<4 x i8> %a0) {
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pminsw %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pminsw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pminsw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminsb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pminsb %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: psraw $8, %xmm0
+; SSE41-NEXT: psllw $8, %xmm1
+; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: pminsw %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: psraw $8, %xmm0
+; SSE41-NEXT: pminsw %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminsb %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psllw $8, %xmm1
+; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: pminsw %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX-NEXT: vpminsw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX512-NEXT: vpminsw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
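
The smin check changes above all follow one pattern: with widening legalization reverted, narrow vectors such as <2 x i16> or <8 x i8> are legalized by promoting each element to a wider lane, so every reduction step must first re-sign-extend in-register (the psllw/psraw, pslld/psrad, and psllq/vpsraq shift pairs) before taking a signed min at the wider width. A minimal IR sketch of what that promotion amounts to for a <2 x i16> smin reduction (the function name is illustrative, not part of this patch):

  define i16 @smin_promoted_sketch(<2 x i16> %a0) {
    ; Promote the i16 elements to i64 lanes, mirroring the
    ; sign-extension shift pairs in the checks above.
    %w   = sext <2 x i16> %a0 to <2 x i64>
    ; Pull the high element down and take the signed minimum.
    %hi  = shufflevector <2 x i64> %w, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    %lt  = icmp slt <2 x i64> %w, %hi
    %min = select <2 x i1> %lt, <2 x i64> %w, <2 x i64> %hi
    %e   = extractelement <2 x i64> %min, i32 0
    %t   = trunc i64 %e to i16
    ret i16 %t
  }
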
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-umax.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-umax.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-umax.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-umax.ll Tue Aug 6 16:00:43 2019
@@ -727,38 +727,88 @@ define i64 @test_v16i64(<16 x i64> %a0)
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxud %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: retq
;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX2-LABEL: test_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
+; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v2i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a0)
ret i32 %1
}
@@ -1160,41 +1210,85 @@ define i32 @test_v32i32(<32 x i32> %a0)
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmaxuw %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT: retq
%1 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> %a0)
ret i16 %1
}
@@ -1202,48 +1296,69 @@ define i16 @test_v2i16(<2 x i16> %a0) {
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxuw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmaxuw %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7]
+; SSE41-NEXT: pmaxud %xmm0, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
@@ -1647,37 +1762,83 @@ define i16 @test_v64i16(<64 x i16> %a0)
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: por %xmm3, %xmm4
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> %a0)
ret i8 %1
}
@@ -1685,44 +1846,82 @@ define i8 @test_v2i8(<2 x i8> %a0) {
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmaxub %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmaxud %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -1733,52 +1932,92 @@ define i8 @test_v4i8(<4 x i8> %a0) {
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pmaxsw %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pmaxsw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmaxuw %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmaxuw %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmaxub %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmaxuw %xmm1, %xmm0
+; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
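
The umax sequences combine the same element promotion (here via zero-extension, e.g. pmovzxwq or pand with a lane mask) with the classic sign-bit trick for unsigned compares: SSE has no unsigned integer compare, so both operands are biased by a sign-bit constant (the [2147483648,...] splats above) and a signed pcmpgtd-based compare is used instead, with the pcmpeqd/pshufd/pand/por dance assembling a full 64-bit result from the 32-bit compares. A sketch of the bias trick in IR (illustrative name; this constant flips the i64 sign bit directly, whereas the SSE code above works dword by dword):

  define <2 x i64> @umax_bias_sketch(<2 x i64> %a, <2 x i64> %b) {
    ; Flipping the sign bit maps unsigned order onto signed order:
    ; icmp sgt on the biased values == icmp ugt on the originals.
    %ab = xor <2 x i64> %a, <i64 -9223372036854775808, i64 -9223372036854775808>
    %bb = xor <2 x i64> %b, <i64 -9223372036854775808, i64 -9223372036854775808>
    %gt = icmp sgt <2 x i64> %ab, %bb
    %r  = select <2 x i1> %gt, <2 x i64> %a, <2 x i64> %b
    ret <2 x i64> %r
  }
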
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-umin.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-umin.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-umin.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-umin.ll Tue Aug 6 16:00:43 2019
@@ -725,38 +725,87 @@ define i64 @test_v16i64(<16 x i64> %a0)
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminud %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
+; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: retq
;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX2-LABEL: test_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
+; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v2i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpminuq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a0)
ret i32 %1
}
@@ -1158,41 +1207,85 @@ define i32 @test_v32i32(<32 x i32> %a0)
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pminuw %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512VL-NEXT: retq
%1 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> %a0)
ret i16 %1
}
@@ -1200,48 +1293,70 @@ define i16 @test_v2i16(<2 x i16> %a0) {
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminuw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pminuw %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7]
+; SSE41-NEXT: pminud %xmm0, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: pminud %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX-NEXT: vpminud %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
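
With the widening default reverted, <4 x i16> goes back to being promoted, so the i16 data lives in i32 lanes; that is why the new checks zero the odd words (the pblendw against a zeroed register) and then use pminud rather than pminuw. A hypothetical IR sketch of that promoted reduction, not taken from the test file itself:

define i16 @umin_v4i16_promoted(<4 x i16> %v) {
  ; widen to i32 lanes, reduce with unsigned min, truncate back
  %w = zext <4 x i16> %v to <4 x i32>
  %h1 = shufflevector <4 x i32> %w, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %c1 = icmp ult <4 x i32> %w, %h1
  %m1 = select <4 x i1> %c1, <4 x i32> %w, <4 x i32> %h1
  %h2 = shufflevector <4 x i32> %m1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %c2 = icmp ult <4 x i32> %m1, %h2
  %m2 = select <4 x i1> %c2, <4 x i32> %m1, <4 x i32> %h2
  %e = extractelement <4 x i32> %m2, i32 0
  %r = trunc i32 %e to i16
  ret i16 %r
}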
@@ -1552,37 +1667,83 @@ define i16 @test_v64i16(<64 x i16> %a0)
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> %a0)
ret i8 %1
}
@@ -1590,44 +1751,83 @@ define i8 @test_v2i8(<2 x i8> %a0) {
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pminud %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pminud %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpminud %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
@@ -1638,52 +1838,92 @@ define i8 @test_v4i8(<4 x i8> %a0) {
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pminsw %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pminsw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pminuw %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pminuw %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pminub %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pminuw %xmm1, %xmm0
+; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpminuw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpminuw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpminuw %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpminuw %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpminuw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
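
One pattern worth calling out before the next file: every i8 reduction above now begins by and-ing with a [255,0,255,0,...] byte mask. Under promotion the i8 elements occupy i16 (or wider) lanes whose high bits are undefined, so they must be cleared before any unsigned compare can be trusted. That step in isolation, as a hypothetical helper mirroring the pand against the constant-pool value:

define <8 x i16> @clear_promoted_bytes(<8 x i16> %x) {
  ; keep only the low byte of each i16 lane
  %m = and <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %m
}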
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-xor-bool.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-xor-bool.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-xor-bool.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-xor-bool.ll Tue Aug 6 16:00:43 2019
@@ -118,29 +118,17 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) {
}
define i1 @trunc_v8i16_v8i1(<8 x i8>) {
-; SSE2-LABEL: trunc_v8i16_v8i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: xorb $0, %al
-; SSE2-NEXT: setnp %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: trunc_v8i16_v8i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: psllw $15, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: xorb $0, %al
-; SSE41-NEXT: setnp %al
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_v8i16_v8i1:
+; SSE: # %bb.0:
+; SSE-NEXT: psllw $15, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: xorb $0, %al
+; SSE-NEXT: setnp %al
+; SSE-NEXT: retq
;
; AVX-LABEL: trunc_v8i16_v8i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
@@ -150,9 +138,9 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
;
; AVX512F-LABEL: trunc_v8i16_v8i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: xorb $0, %al
; AVX512F-NEXT: setnp %al
@@ -161,8 +149,8 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
;
; AVX512BW-LABEL: trunc_v8i16_v8i1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: xorb $0, %al
; AVX512BW-NEXT: setnp %al
@@ -171,8 +159,8 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
;
; AVX512VL-LABEL: trunc_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovb2m %xmm0, %k0
+; AVX512VL-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovw2m %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: xorb $0, %al
; AVX512VL-NEXT: setnp %al
@@ -1170,33 +1158,22 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) {
}
define i1 @icmp_v8i16_v8i1(<8 x i8>) {
-; SSE2-LABEL: icmp_v8i16_v8i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: packsswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: xorb $0, %al
-; SSE2-NEXT: setnp %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: icmp_v8i16_v8i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: xorb $0, %al
-; SSE41-NEXT: setnp %al
-; SSE41-NEXT: retq
+; SSE-LABEL: icmp_v8i16_v8i1:
+; SSE: # %bb.0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpeqw %xmm0, %xmm1
+; SSE-NEXT: packsswb %xmm0, %xmm1
+; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: xorb $0, %al
+; SSE-NEXT: setnp %al
+; SSE-NEXT: retq
;
; AVX-LABEL: icmp_v8i16_v8i1:
; AVX: # %bb.0:
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: xorb $0, %al
@@ -1205,10 +1182,11 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
;
; AVX512F-LABEL: icmp_v8i16_v8i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: xorb $0, %al
; AVX512F-NEXT: setnp %al
@@ -1218,7 +1196,8 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
; AVX512BW-LABEL: icmp_v8i16_v8i1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: xorb $0, %al
; AVX512BW-NEXT: setnp %al
@@ -1227,7 +1206,7 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
;
; AVX512VL-LABEL: icmp_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
+; AVX512VL-NEXT: vptestnmw {{.*}}(%rip), %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: xorb $0, %al
; AVX512VL-NEXT: setnp %al
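
The xor-bool reductions in this file compile down to a parity check: pmovmskb (or kmovd) collects one bit per lane, xorb $0, %al sets PF from the low byte, and setnp produces 1 exactly when an odd number of those bits are set, which is their xor. A sketch of the operation under test, assuming the reduce.xor intrinsic family this file exercises:

declare i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1>)

define i1 @parity_of_lsbs(<8 x i8> %x) {
  ; truncation keeps bit 0 of each lane; xor-reducing those bits
  ; yields their parity
  %t = trunc <8 x i8> %x to <8 x i1>
  %r = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %t)
  ret i1 %r
}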
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-xor.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-xor.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-xor.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-xor.ll Tue Aug 6 16:00:43 2019
@@ -182,14 +182,14 @@ define i64 @test_v16i64(<16 x i64> %a0)
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
@@ -397,8 +397,7 @@ define i32 @test_v32i32(<32 x i32> %a0)
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -406,7 +405,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -418,10 +417,9 @@ define i16 @test_v2i16(<2 x i16> %a0) {
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -429,9 +427,9 @@ define i16 @test_v4i16(<4 x i16> %a0) {
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -686,8 +684,7 @@ define i16 @test_v64i16(<64 x i16> %a0)
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -695,8 +692,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -704,7 +700,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -716,11 +712,9 @@ define i8 @test_v2i8(<2 x i8> %a0) {
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -728,11 +722,9 @@ define i8 @test_v4i8(<4 x i8> %a0) {
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -740,9 +732,9 @@ define i8 @test_v4i8(<4 x i8> %a0) {
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -754,13 +746,12 @@ define i8 @test_v4i8(<4 x i8> %a0) {
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
@@ -768,13 +759,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -782,12 +772,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
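
The integer xor reductions above have no signedness or lane-width concerns, so both before and after the revert they lower to a log2-depth shuffle-and-xor tree; only the shuffle flavors change with the legal element types. The tree, written out as hypothetical IR rather than copied from the test:

define i8 @xor_v8i8_tree(<8 x i8> %v) {
  ; fold the high half into the low half, three times, then take lane 0
  %h1 = shufflevector <8 x i8> %v, <8 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %x1 = xor <8 x i8> %v, %h1
  %h2 = shufflevector <8 x i8> %x1, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x2 = xor <8 x i8> %x1, %h2
  %h3 = shufflevector <8 x i8> %x2, <8 x i8> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x3 = xor <8 x i8> %x2, %h3
  %r = extractelement <8 x i8> %x3, i32 0
  ret i8 %r
}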
Modified: llvm/trunk/test/CodeGen/X86/vector-sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-sext.ll?rev=368107&r1=368106&r2=368107&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-sext.ll Tue Aug 6 16:00:43 2019
@@ -3064,15 +3064,23 @@ define i32 @sext_2i8_to_i32(<16 x i8> %A
;
; X32-SSE2-LABEL: sext_2i8_to_i32:
; X32-SSE2: # %bb.0: # %entry
+; X32-SSE2-NEXT: pushl %eax
+; X32-SSE2-NEXT: .cfi_def_cfa_offset 8
; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE2-NEXT: psraw $8, %xmm0
; X32-SSE2-NEXT: movd %xmm0, %eax
+; X32-SSE2-NEXT: popl %ecx
+; X32-SSE2-NEXT: .cfi_def_cfa_offset 4
; X32-SSE2-NEXT: retl
;
; X32-SSE41-LABEL: sext_2i8_to_i32:
; X32-SSE41: # %bb.0: # %entry
+; X32-SSE41-NEXT: pushl %eax
+; X32-SSE41-NEXT: .cfi_def_cfa_offset 8
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; X32-SSE41-NEXT: movd %xmm0, %eax
+; X32-SSE41-NEXT: popl %ecx
+; X32-SSE41-NEXT: .cfi_def_cfa_offset 4
; X32-SSE41-NEXT: retl
entry:
%Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
@@ -3171,71 +3179,84 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x
define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; SSE2-LABEL: sext_4i8_to_4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: pslld $24, %xmm0
+; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i8_to_4i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: psrad $24, %xmm1
+; SSSE3-NEXT: pslld $24, %xmm0
+; SSSE3-NEXT: psrad $24, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i8_to_4i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm2
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm1
+; SSE41-NEXT: pslld $24, %xmm0
+; SSE41-NEXT: psrad $24, %xmm0
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_4i8_to_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
+; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_4i8_to_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0
+; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_4i8_to_4i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0
+; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE2-LABEL: sext_4i8_to_4i64:
; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $24, %xmm1
+; X32-SSE2-NEXT: pslld $24, %xmm0
+; X32-SSE2-NEXT: psrad $24, %xmm0
; X32-SSE2-NEXT: pxor %xmm2, %xmm2
+; X32-SSE2-NEXT: pxor %xmm3, %xmm3
+; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-SSE2-NEXT: retl
;
; X32-SSE41-LABEL: sext_4i8_to_4i64:
; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2
-; X32-SSE41-NEXT: psrld $16, %xmm0
-; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1
+; X32-SSE41-NEXT: pslld $24, %xmm0
+; X32-SSE41-NEXT: psrad $24, %xmm0
+; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
%extmask = sext <4 x i8> %mask to <4 x i64>
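
The pslld $24 / psrad $24 pair that replaces pmovsxbq in these hunks is the classic sign-extend-in-register idiom: shifting left then arithmetic-shifting right replicates the sign bit of the low byte across each i32 lane. In IR terms, as a standalone hypothetical sketch:

define <4 x i32> @sext_in_reg_i8_to_i32(<4 x i32> %x) {
  ; shl then ashr by 24 sign-extends the low byte of every lane
  %a = shl <4 x i32> %x, <i32 24, i32 24, i32 24, i32 24>
  %b = ashr <4 x i32> %a, <i32 24, i32 24, i32 24, i32 24>
  ret <4 x i32> %b
}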
@@ -3325,7 +3346,8 @@ define <2 x i32> @sext_2i8_to_2i32(<2 x
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: paddq %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_2i8_to_2i32:
@@ -3335,23 +3357,20 @@ define <2 x i32> @sext_2i8_to_2i32(<2 x
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: paddd %xmm0, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-NEXT: paddq %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_2i8_to_2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movzwl (%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: paddd %xmm0, %xmm0
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: paddq %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_2i8_to_2i32:
; AVX: # %bb.0:
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
+; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; X32-SSE2-LABEL: sext_2i8_to_2i32:
@@ -3362,16 +3381,15 @@ define <2 x i32> @sext_2i8_to_2i32(<2 x
; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-SSE2-NEXT: psrad $24, %xmm0
-; X32-SSE2-NEXT: paddd %xmm0, %xmm0
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-SSE2-NEXT: paddq %xmm0, %xmm0
; X32-SSE2-NEXT: retl
;
; X32-SSE41-LABEL: sext_2i8_to_2i32:
; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movzwl (%eax), %eax
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; X32-SSE41-NEXT: paddd %xmm0, %xmm0
+; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT: paddq %xmm0, %xmm0
; X32-SSE41-NEXT: retl
%x = load <2 x i8>, <2 x i8>* %addr, align 1
%y = sext <2 x i8> %x to <2 x i32>
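
The diff context cuts the body off after the sext; judging from the paddd/paddq in the CHECK lines, the test presumably doubles the extended value before returning it. A hypothetical reconstruction, not the verbatim test:

define <2 x i32> @sext_2i8_to_2i32_sketch(<2 x i8>* %addr) {
  %x = load <2 x i8>, <2 x i8>* %addr, align 1
  %y = sext <2 x i8> %x to <2 x i32>
  ; the add that appears above as paddd (widened) or paddq (promoted)
  %z = add <2 x i32> %y, %y
  ret <2 x i32> %z
}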
@@ -3797,8 +3815,8 @@ entry:
define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
; SSE2-LABEL: zext_negate_sext:
; SSE2: # %bb.0:
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: psrad $16, %xmm0
@@ -3808,8 +3826,8 @@ define <8 x i32> @zext_negate_sext(<8 x
;
; SSSE3-LABEL: zext_negate_sext:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: psubw %xmm0, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: psrad $16, %xmm0
@@ -3819,7 +3837,7 @@ define <8 x i32> @zext_negate_sext(<8 x
;
; SSE41-LABEL: zext_negate_sext:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
@@ -3829,33 +3847,35 @@ define <8 x i32> @zext_negate_sext(<8 x
;
; AVX1-LABEL: zext_negate_sext:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_negate_sext:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_negate_sext:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE2-LABEL: zext_negate_sext:
; X32-SSE2: # %bb.0:
+; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X32-SSE2-NEXT: psubw %xmm0, %xmm1
; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-SSE2-NEXT: psrad $16, %xmm0
@@ -3865,7 +3885,7 @@ define <8 x i32> @zext_negate_sext(<8 x
;
; X32-SSE41-LABEL: zext_negate_sext:
; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE41-NEXT: pxor %xmm1, %xmm1
; X32-SSE41-NEXT: psubw %xmm0, %xmm1
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
@@ -3881,8 +3901,7 @@ define <8 x i32> @zext_negate_sext(<8 x
define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
; SSE2-LABEL: zext_decremenet_sext:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3893,8 +3912,7 @@ define <8 x i32> @zext_decremenet_sext(<
;
; SSSE3-LABEL: zext_decremenet_sext:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: paddw %xmm0, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3905,7 +3923,7 @@ define <8 x i32> @zext_decremenet_sext(<
;
; SSE41-LABEL: zext_decremenet_sext:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
@@ -3915,33 +3933,35 @@ define <8 x i32> @zext_decremenet_sext(<
;
; AVX1-LABEL: zext_decremenet_sext:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_decremenet_sext:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_decremenet_sext:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE2-LABEL: zext_decremenet_sext:
; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE2-NEXT: paddw %xmm0, %xmm1
; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3952,7 +3972,7 @@ define <8 x i32> @zext_decremenet_sext(<
;
; X32-SSE41-LABEL: zext_decremenet_sext:
; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE41-NEXT: paddw %xmm0, %xmm1
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
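
To close out the vector-sext.ll hunks: zext_negate_sext and zext_decremenet_sext both start from <8 x i8>, and the revert swaps their leading pmovzxbw/punpcklbw for a plain pand because, under promotion, the bytes already sit in i16 lanes and zero-extension reduces to clearing the high bits. The semantics under test, sketched from the function name and CHECK lines rather than from the elided IR:

define <8 x i32> @zext_negate_sext_sketch(<8 x i8> %x) {
  ; zero-extend to i16, negate, then sign-extend the result to i32
  %z = zext <8 x i8> %x to <8 x i16>
  %n = sub <8 x i16> zeroinitializer, %z
  %r = sext <8 x i16> %n to <8 x i32>
  ret <8 x i32> %r
}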