[llvm] r343097 - [X86][SSE] Refresh PR34947 test code to handle D52504

Simon Pilgrim via llvm-commits <llvm-commits at lists.llvm.org>
Wed Sep 26 04:53:51 PDT 2018


Author: rksimon
Date: Wed Sep 26 04:53:51 2018
New Revision: 343097

URL: http://llvm.org/viewvc/llvm-project?rev=343097&view=rev
Log:
[X86][SSE] Refresh PR34947 test code to handle D52504

The previously reduced version used 'urem <9 x i32> zeroinitializer, %tmp', which D52504 will simplify away (a urem whose dividend is all zeros folds to zero).
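
For reference, a minimal sketch of the change to the reduced IR (both forms appear verbatim in the diff below): the dividend is now a loaded, zero-extended value rather than zeroinitializer, so the urem survives the D52504 fold:

  ; Old reduced form - '0 urem X' constant-folds, killing the test:
  %tmp = load <9 x i32>, <9 x i32>* undef, align 64
  %rem = urem <9 x i32> zeroinitializer, %tmp

  ; Refreshed form - the dividend is a zero-extended <9 x i16> load:
  %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
  %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
  %ext0 = zext <9 x i16> %a0 to <9 x i32>
  %rem = urem <9 x i32> %ext0, %a1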

Modified:
    llvm/trunk/test/CodeGen/X86/shrink_vmul.ll

Modified: llvm/trunk/test/CodeGen/X86/shrink_vmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shrink_vmul.ll?rev=343097&r1=343096&r2=343097&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shrink_vmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shrink_vmul.ll Wed Sep 26 04:53:51 2018
@@ -2196,46 +2196,106 @@ entry:
 ; Illegal Types
 ;
 
-define void @PR34947() {
+define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-SSE-LABEL: PR34947:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movdqa (%eax), %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; X86-SSE-NEXT:    movd %xmm1, %ecx
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE-NEXT:    .cfi_offset %esi, -8
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movdqa (%eax), %xmm5
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movdqa (%ecx), %xmm3
+; X86-SSE-NEXT:    movdqa 16(%ecx), %xmm6
+; X86-SSE-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
+; X86-SSE-NEXT:    movd %xmm0, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
+; X86-SSE-NEXT:    movd %xmm0, %esi
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
-; X86-SSE-NEXT:    movd %edx, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; X86-SSE-NEXT:    movd %xmm2, %ecx
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
+; X86-SSE-NEXT:    movd %xmm2, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[2,3,0,1]
+; X86-SSE-NEXT:    movd %xmm2, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm7
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; X86-SSE-NEXT:    movd %xmm5, %eax
+; X86-SSE-NEXT:    movd %xmm6, %esi
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
+; X86-SSE-NEXT:    divl %esi
 ; X86-SSE-NEXT:    movd %edx, %xmm2
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE-NEXT:    movd %xmm0, %ecx
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; X86-SSE-NEXT:    movd %xmm5, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; X86-SSE-NEXT:    movd %xmm5, %esi
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
-; X86-SSE-NEXT:    movd %edx, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE-NEXT:    movd %xmm0, %ecx
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm5
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
+; X86-SSE-NEXT:    movd %xmm6, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3]
+; X86-SSE-NEXT:    movd %xmm6, %esi
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
-; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm6
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
+; X86-SSE-NEXT:    movd %xmm7, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
+; X86-SSE-NEXT:    movd %xmm7, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm7
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; X86-SSE-NEXT:    movd %xmm4, %eax
+; X86-SSE-NEXT:    movd %xmm3, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; X86-SSE-NEXT:    movd %xmm4, %eax
+; X86-SSE-NEXT:    movd %edx, %xmm4
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; X86-SSE-NEXT:    movd %xmm3, %esi
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl %esi
+; X86-SSE-NEXT:    movd %edx, %xmm3
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm6[0,0]
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm4
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm3
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0]
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl (%eax)
+; X86-SSE-NEXT:    divl 32(%ecx)
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm5
 ; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-SSE-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X86-SSE-NEXT:    movd %eax, %xmm2
 ; X86-SSE-NEXT:    pmuludq %xmm0, %xmm2
 ; X86-SSE-NEXT:    movd %xmm2, (%eax)
 ; X86-SSE-NEXT:    movdqa %xmm1, (%eax)
+; X86-SSE-NEXT:    movdqa %xmm4, (%eax)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: PR34947:
@@ -2254,66 +2314,74 @@ define void @PR34947() {
 ; X86-AVX1-NEXT:    .cfi_offset %edi, -16
 ; X86-AVX1-NEXT:    .cfi_offset %ebx, -12
 ; X86-AVX1-NEXT:    .cfi_offset %ebp, -8
-; X86-AVX1-NEXT:    vmovdqa (%eax), %ymm0
-; X86-AVX1-NEXT:    xorl %eax, %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vmovdqa (%eax), %ymm2
+; X86-AVX1-NEXT:    vmovdqa (%ecx), %ymm1
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; X86-AVX1-NEXT:    vmovd %xmm2, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl (%eax)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
-; X86-AVX1-NEXT:    xorl %eax, %eax
+; X86-AVX1-NEXT:    divl 32(%ecx)
+; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
+; X86-AVX1-NEXT:    vpextrd $3, %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
-; X86-AVX1-NEXT:    xorl %eax, %eax
+; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; X86-AVX1-NEXT:    vpextrd $2, %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
-; X86-AVX1-NEXT:    xorl %eax, %eax
+; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
+; X86-AVX1-NEXT:    vpextrd $1, %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vmovd %xmm0, %ecx
-; X86-AVX1-NEXT:    xorl %eax, %eax
+; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
+; X86-AVX1-NEXT:    vmovd %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, %ebp
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT:    xorl %eax, %eax
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
+; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, %ecx
-; X86-AVX1-NEXT:    xorl %eax, %eax
+; X86-AVX1-NEXT:    movl %edx, %ebx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %esi
+; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
+; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
 ; X86-AVX1-NEXT:    divl %esi
 ; X86-AVX1-NEXT:    movl %edx, %esi
-; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edi
+; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %edi
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-AVX1-NEXT:    divl %edi
 ; X86-AVX1-NEXT:    movl %edx, %edi
-; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vmovd %xmm0, %ebx
-; X86-AVX1-NEXT:    divl %ebx
+; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    vmovd %edx, %xmm0
 ; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $3, %ebx, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
 ; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Folded Reload
 ; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X86-AVX1-NEXT:    vmovd %eax, %xmm3
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
-; X86-AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpmaddwd %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm1
 ; X86-AVX1-NEXT:    vmovd %xmm1, (%eax)
@@ -2333,109 +2401,174 @@ define void @PR34947() {
 ;
 ; X86-AVX2-LABEL: PR34947:
 ; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    pushl %esi
+; X86-AVX2-NEXT:    pushl %edi
 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
-; X86-AVX2-NEXT:    vmovdqa (%eax), %ymm0
-; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; X86-AVX2-NEXT:    xorl %eax, %eax
+; X86-AVX2-NEXT:    pushl %esi
+; X86-AVX2-NEXT:    .cfi_def_cfa_offset 12
+; X86-AVX2-NEXT:    .cfi_offset %esi, -12
+; X86-AVX2-NEXT:    .cfi_offset %edi, -8
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vmovdqa (%eax), %ymm2
+; X86-AVX2-NEXT:    vmovdqa (%esi), %ymm1
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; X86-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; X86-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
 ; X86-AVX2-NEXT:    movl %edx, %ecx
-; X86-AVX2-NEXT:    vmovd %xmm1, %esi
-; X86-AVX2-NEXT:    xorl %eax, %eax
+; X86-AVX2-NEXT:    vmovd %xmm3, %edi
+; X86-AVX2-NEXT:    vmovd %xmm4, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %esi
-; X86-AVX2-NEXT:    vmovd %edx, %xmm2
-; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; X86-AVX2-NEXT:    xorl %eax, %eax
+; X86-AVX2-NEXT:    divl %edi
+; X86-AVX2-NEXT:    vmovd %edx, %xmm5
+; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
+; X86-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
+; X86-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; X86-AVX2-NEXT:    xorl %eax, %eax
+; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
+; X86-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
+; X86-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm1
-; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %ecx
-; X86-AVX2-NEXT:    xorl %eax, %eax
+; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
+; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
+; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
 ; X86-AVX2-NEXT:    movl %edx, %ecx
-; X86-AVX2-NEXT:    vmovd %xmm0, %esi
-; X86-AVX2-NEXT:    xorl %eax, %eax
+; X86-AVX2-NEXT:    vmovd %xmm1, %edi
+; X86-AVX2-NEXT:    vmovd %xmm2, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %esi
-; X86-AVX2-NEXT:    vmovd %edx, %xmm2
-; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; X86-AVX2-NEXT:    vpextrd $2, %xmm0, %ecx
-; X86-AVX2-NEXT:    xorl %eax, %eax
+; X86-AVX2-NEXT:    divl %edi
+; X86-AVX2-NEXT:    vmovd %edx, %xmm4
+; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
+; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
+; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; X86-AVX2-NEXT:    vpextrd $3, %xmm0, %ecx
-; X86-AVX2-NEXT:    xorl %eax, %eax
+; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
+; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
+; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    xorl %eax, %eax
+; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
+; X86-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl (%eax)
-; X86-AVX2-NEXT:    vmovd %edx, %xmm1
+; X86-AVX2-NEXT:    divl 32(%esi)
+; X86-AVX2-NEXT:    vmovd %edx, %xmm0
 ; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
-; X86-AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X86-AVX2-NEXT:    vmovd %eax, %xmm2
-; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vmovd %xmm1, (%eax)
-; X86-AVX2-NEXT:    vmovdqa %ymm0, (%eax)
+; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, (%eax)
+; X86-AVX2-NEXT:    vmovdqa %ymm1, (%eax)
 ; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT:    popl %edi
 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-SSE-LABEL: PR34947:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movdqa (%rax), %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; X64-SSE-NEXT:    movd %xmm1, %ecx
-; X64-SSE-NEXT:    xorl %eax, %eax
+; X64-SSE-NEXT:    movdqa (%rdi), %xmm5
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    movdqa (%rsi), %xmm2
+; X64-SSE-NEXT:    movdqa 16(%rsi), %xmm6
+; X64-SSE-NEXT:    pxor %xmm0, %xmm0
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-SSE-NEXT:    movdqa %xmm5, %xmm4
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
+; X64-SSE-NEXT:    movd %xmm0, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
+; X64-SSE-NEXT:    movd %xmm0, %ecx
 ; X64-SSE-NEXT:    xorl %edx, %edx
 ; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; X64-SSE-NEXT:    movd %xmm2, %ecx
-; X64-SSE-NEXT:    xorl %eax, %eax
+; X64-SSE-NEXT:    movd %edx, %xmm8
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
+; X64-SSE-NEXT:    movd %xmm3, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
+; X64-SSE-NEXT:    movd %xmm3, %ecx
 ; X64-SSE-NEXT:    xorl %edx, %edx
 ; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm2
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE-NEXT:    movd %xmm0, %ecx
-; X64-SSE-NEXT:    xorl %eax, %eax
+; X64-SSE-NEXT:    movd %edx, %xmm7
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; X64-SSE-NEXT:    movd %xmm5, %eax
+; X64-SSE-NEXT:    movd %xmm6, %ecx
 ; X64-SSE-NEXT:    xorl %edx, %edx
 ; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X64-SSE-NEXT:    movd %xmm0, %ecx
-; X64-SSE-NEXT:    xorl %eax, %eax
+; X64-SSE-NEXT:    movd %edx, %xmm3
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; X64-SSE-NEXT:    movd %xmm5, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; X64-SSE-NEXT:    movd %xmm5, %ecx
 ; X64-SSE-NEXT:    xorl %edx, %edx
 ; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X64-SSE-NEXT:    xorl %eax, %eax
+; X64-SSE-NEXT:    movd %edx, %xmm5
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
+; X64-SSE-NEXT:    movd %xmm6, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
+; X64-SSE-NEXT:    movd %xmm6, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm6
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
+; X64-SSE-NEXT:    movd %xmm7, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
+; X64-SSE-NEXT:    movd %xmm7, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm7
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; X64-SSE-NEXT:    movd %xmm4, %eax
+; X64-SSE-NEXT:    movd %xmm2, %ecx
 ; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl (%rax)
+; X64-SSE-NEXT:    divl %ecx
 ; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pmaddwd {{.*}}(%rip), %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; X64-SSE-NEXT:    movd %xmm4, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X64-SSE-NEXT:    movd %xmm2, %ecx
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl %ecx
+; X64-SSE-NEXT:    movd %edx, %xmm2
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl 32(%rsi)
+; X64-SSE-NEXT:    movd %edx, %xmm1
+; X64-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
+; X64-SSE-NEXT:    pmuludq %xmm4, %xmm0
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
+; X64-SSE-NEXT:    pmuludq %xmm4, %xmm2
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-SSE-NEXT:    pmuludq %xmm4, %xmm3
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; X64-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0]
+; X64-SSE-NEXT:    pmuludq %xmm4, %xmm5
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X64-SSE-NEXT:    movl $8199, %eax # imm = 0x2007
-; X64-SSE-NEXT:    movd %eax, %xmm2
-; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
-; X64-SSE-NEXT:    movd %xmm2, (%rax)
-; X64-SSE-NEXT:    movdqa %xmm1, (%rax)
+; X64-SSE-NEXT:    movd %eax, %xmm3
+; X64-SSE-NEXT:    pmuludq %xmm1, %xmm3
+; X64-SSE-NEXT:    movd %xmm3, (%rax)
+; X64-SSE-NEXT:    movdqa %xmm2, (%rax)
+; X64-SSE-NEXT:    movdqa %xmm0, (%rax)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: PR34947:
@@ -2446,49 +2579,55 @@ define void @PR34947() {
 ; X64-AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; X64-AVX1-NEXT:    .cfi_offset %rbx, -24
 ; X64-AVX1-NEXT:    .cfi_offset %rbp, -16
-; X64-AVX1-NEXT:    vmovdqa (%rax), %ymm0
-; X64-AVX1-NEXT:    xorl %eax, %eax
+; X64-AVX1-NEXT:    vmovdqa (%rdi), %ymm2
+; X64-AVX1-NEXT:    vmovdqa (%rsi), %ymm1
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; X64-AVX1-NEXT:    vmovd %xmm2, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl (%rax)
+; X64-AVX1-NEXT:    divl 32(%rsi)
 ; X64-AVX1-NEXT:    movl %edx, %r8d
-; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
-; X64-AVX1-NEXT:    xorl %eax, %eax
+; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
+; X64-AVX1-NEXT:    vpextrd $3, %xmm3, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %r9d
-; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
-; X64-AVX1-NEXT:    xorl %eax, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; X64-AVX1-NEXT:    vpextrd $2, %xmm3, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %r10d
-; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
-; X64-AVX1-NEXT:    xorl %eax, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
+; X64-AVX1-NEXT:    vpextrd $1, %xmm3, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %r11d
-; X64-AVX1-NEXT:    vmovd %xmm0, %ecx
-; X64-AVX1-NEXT:    xorl %eax, %eax
+; X64-AVX1-NEXT:    vmovd %xmm1, %ecx
+; X64-AVX1-NEXT:    vmovd %xmm3, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %esi
-; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
-; X64-AVX1-NEXT:    xorl %eax, %eax
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
+; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %edi
-; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
-; X64-AVX1-NEXT:    xorl %eax, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %ecx
-; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %ebx
-; X64-AVX1-NEXT:    xorl %eax, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ebx
+; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ebx
 ; X64-AVX1-NEXT:    movl %edx, %ebx
-; X64-AVX1-NEXT:    vmovd %xmm0, %ebp
-; X64-AVX1-NEXT:    xorl %eax, %eax
+; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ebp
 ; X64-AVX1-NEXT:    vmovd %edx, %xmm0
@@ -2496,12 +2635,12 @@ define void @PR34947() {
 ; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %esi, %xmm2
 ; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT:    vmovd %r8d, %xmm1
 ; X64-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
@@ -2518,66 +2657,73 @@ define void @PR34947() {
 ;
 ; X64-AVX2-LABEL: PR34947:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqa (%rax), %ymm0
-; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; X64-AVX2-NEXT:    xorl %eax, %eax
+; X64-AVX2-NEXT:    vmovdqa (%rdi), %ymm2
+; X64-AVX2-NEXT:    vmovdqa (%rsi), %ymm1
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; X64-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; X64-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
 ; X64-AVX2-NEXT:    movl %edx, %ecx
-; X64-AVX2-NEXT:    vmovd %xmm1, %esi
-; X64-AVX2-NEXT:    xorl %eax, %eax
+; X64-AVX2-NEXT:    vmovd %xmm3, %edi
+; X64-AVX2-NEXT:    vmovd %xmm4, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %esi
-; X64-AVX2-NEXT:    vmovd %edx, %xmm2
-; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; X64-AVX2-NEXT:    xorl %eax, %eax
+; X64-AVX2-NEXT:    divl %edi
+; X64-AVX2-NEXT:    vmovd %edx, %xmm5
+; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
+; X64-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
+; X64-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; X64-AVX2-NEXT:    xorl %eax, %eax
+; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
+; X64-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
+; X64-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm1
-; X64-AVX2-NEXT:    vpextrd $1, %xmm0, %ecx
-; X64-AVX2-NEXT:    xorl %eax, %eax
+; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
+; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
+; X64-AVX2-NEXT:    vpextrd $1, %xmm2, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
 ; X64-AVX2-NEXT:    movl %edx, %ecx
-; X64-AVX2-NEXT:    vmovd %xmm0, %esi
-; X64-AVX2-NEXT:    xorl %eax, %eax
+; X64-AVX2-NEXT:    vmovd %xmm1, %edi
+; X64-AVX2-NEXT:    vmovd %xmm2, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %esi
-; X64-AVX2-NEXT:    vmovd %edx, %xmm2
-; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrd $2, %xmm0, %ecx
-; X64-AVX2-NEXT:    xorl %eax, %eax
+; X64-AVX2-NEXT:    divl %edi
+; X64-AVX2-NEXT:    vmovd %edx, %xmm4
+; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
+; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
+; X64-AVX2-NEXT:    vpextrd $2, %xmm2, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrd $3, %xmm0, %ecx
-; X64-AVX2-NEXT:    xorl %eax, %eax
+; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
+; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
+; X64-AVX2-NEXT:    vpextrd $3, %xmm2, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
-; X64-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    xorl %eax, %eax
+; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
+; X64-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl (%rax)
-; X64-AVX2-NEXT:    vmovd %edx, %xmm1
+; X64-AVX2-NEXT:    divl 32(%rsi)
+; X64-AVX2-NEXT:    vmovd %edx, %xmm0
 ; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
-; X64-AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X64-AVX2-NEXT:    vmovd %eax, %xmm2
-; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT:    vmovd %xmm1, (%rax)
-; X64-AVX2-NEXT:    vmovdqa %ymm0, (%rax)
+; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, (%rax)
+; X64-AVX2-NEXT:    vmovdqa %ymm1, (%rax)
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
-  %tmp = load <9 x i32>, <9 x i32>* undef, align 64
-  %rem = urem <9 x i32> zeroinitializer, %tmp
+  %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
+  %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
+  %ext0 = zext <9 x i16> %a0 to <9 x i32>
+  %rem = urem <9 x i32> %ext0, %a1
   %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
   store <9 x i32> %mul, <9 x i32>* undef, align 64
   ret void