[llvm] r279065 - [X86][SSE] Add SSE1 tests to make sure we don't merge loads on illegal types

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 18 06:41:26 PDT 2016


Author: rksimon
Date: Thu Aug 18 08:41:26 2016
New Revision: 279065

URL: http://llvm.org/viewvc/llvm-project?rev=279065&view=rev
Log:
[X86][SSE] Add SSE1 tests to make sure we don't merge loads on illegal types
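For reference, the tests exercise build-vectors assembled from consecutive scalar loads. A minimal sketch of the pattern, adapted from merge_2i64_i64_12 in the modified file (the insertelement/ret tail is the natural continuation of the truncated hunk below, not shown verbatim in this diff):

  ; On SSE1-only targets <2 x i64> is not a legal type, so the two i64 loads
  ; must stay scalar rather than being merged into a single 128-bit load.
  define <2 x i64> @merge_2i64_i64_12(i64* %ptr) {
    %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
    %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
    %val0 = load i64, i64* %ptr0
    %val1 = load i64, i64* %ptr1
    %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
    %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
    ret <2 x i64> %res1
  }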

Modified:
    llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll

Modified: llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll?rev=279065&r1=279064&r2=279065&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll Thu Aug 18 08:41:26 2016
@@ -5,8 +5,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
 ;
-; Just one 32-bit run to make sure we do reasonable things.
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
+; 32-bit SSE tests to make sure we do reasonable things.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE41
 
 define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_2f64_f64_23:
@@ -19,11 +20,19 @@ define <2 x double> @merge_2f64_f64_23(d
 ; AVX-NEXT:    vmovups 16(%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_2f64_f64_23:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movups 16(%eax), %xmm0
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_2f64_f64_23:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    fldl 16(%eax)
+; X32-SSE1-NEXT:    fldl 24(%eax)
+; X32-SSE1-NEXT:    fxch %st(1)
+; X32-SSE1-NEXT:    retl
+;
+; X32-SSE41-LABEL: merge_2f64_f64_23:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups 16(%eax), %xmm0
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
   %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
   %val0 = load double, double* %ptr0
@@ -44,11 +53,37 @@ define <2 x i64> @merge_2i64_i64_12(i64*
 ; AVX-NEXT:    vmovups 8(%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_2i64_i64_12:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movups 8(%eax), %xmm0
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_2i64_i64_12:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Ltmp0:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Ltmp1:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Ltmp2:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Ltmp3:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 8(%ecx), %edx
+; X32-SSE1-NEXT:    movl 12(%ecx), %esi
+; X32-SSE1-NEXT:    movl 16(%ecx), %edi
+; X32-SSE1-NEXT:    movl 20(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE1-NEXT:    movl %edi, 8(%eax)
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_2i64_i64_12:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups 8(%eax), %xmm0
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
   %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
   %val0 = load i64, i64* %ptr0
@@ -123,11 +158,19 @@ define <4 x float> @merge_4f32_f32_34uu(
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4f32_f32_34uu:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4f32_f32_34uu:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE1-NEXT:    retl
+;
+; X32-SSE41-LABEL: merge_4f32_f32_34uu:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
   %val0 = load float, float* %ptr0
@@ -159,13 +202,22 @@ define <4 x float> @merge_4f32_f32_34z6(
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4f32_f32_34z6:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movups 12(%eax), %xmm1
-; X32-SSE-NEXT:    xorps %xmm0, %xmm0
-; X32-SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4f32_f32_34z6:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movups 12(%eax), %xmm0
+; X32-SSE1-NEXT:    xorps %xmm1, %xmm1
+; X32-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; X32-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; X32-SSE1-NEXT:    retl
+;
+; X32-SSE41-LABEL: merge_4f32_f32_34z6:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups 12(%eax), %xmm1
+; X32-SSE41-NEXT:    xorps %xmm0, %xmm0
+; X32-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
   %ptr3 = getelementptr inbounds float, float* %ptr, i64 6
@@ -189,11 +241,21 @@ define <4 x float> @merge_4f32_f32_45zz(
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4f32_f32_45zz:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4f32_f32_45zz:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE1-NEXT:    xorps %xmm1, %xmm1
+; X32-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE1-NEXT:    retl
+;
+; X32-SSE41-LABEL: merge_4f32_f32_45zz:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
   %val0 = load float, float* %ptr0
@@ -225,12 +287,22 @@ define <4 x float> @merge_4f32_f32_012u(
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4f32_f32_012u:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4f32_f32_012u:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE1-NEXT:    retl
+;
+; X32-SSE41-LABEL: merge_4f32_f32_012u:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
   %ptr2 = getelementptr inbounds float, float* %ptr, i64 2
@@ -266,12 +338,22 @@ define <4 x float> @merge_4f32_f32_019u(
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4f32_f32_019u:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4f32_f32_019u:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE1-NEXT:    retl
+;
+; X32-SSE41-LABEL: merge_4f32_f32_019u:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
   %ptr2 = getelementptr inbounds float, float* %ptr, i64 9
@@ -296,11 +378,29 @@ define <4 x i32> @merge_4i32_i32_23u5(i3
 ; AVX-NEXT:    vmovups 8(%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4i32_i32_23u5:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movups 8(%eax), %xmm0
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4i32_i32_23u5:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Ltmp4:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:  .Ltmp5:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 8(%ecx), %edx
+; X32-SSE1-NEXT:    movl 12(%ecx), %esi
+; X32-SSE1-NEXT:    movl 20(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_23u5:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups 8(%eax), %xmm0
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
   %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
   %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
@@ -324,11 +424,20 @@ define <4 x i32> @merge_4i32_i32_3zuu(i3
 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4i32_i32_3zuu:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4i32_i32_3zuu:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 12(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %ecx, (%eax)
+; X32-SSE1-NEXT:    movl $0, 4(%eax)
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_3zuu:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
   %val0 = load i32, i32* %ptr0
   %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
@@ -347,11 +456,21 @@ define <4 x i32> @merge_4i32_i32_34uu(i3
 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4i32_i32_34uu:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4i32_i32_34uu:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 12(%ecx), %edx
+; X32-SSE1-NEXT:    movl 16(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %ecx, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_34uu:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
   %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 4
   %val0 = load i32, i32* %ptr0
@@ -372,11 +491,23 @@ define <4 x i32> @merge_4i32_i32_45zz(i3
 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4i32_i32_45zz:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4i32_i32_45zz:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 16(%ecx), %edx
+; X32-SSE1-NEXT:    movl 20(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %ecx, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl $0, 12(%eax)
+; X32-SSE1-NEXT:    movl $0, 8(%eax)
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_45zz:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
   %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
   %val0 = load i32, i32* %ptr0
@@ -397,11 +528,53 @@ define <8 x i16> @merge_8i16_i16_23u567u
 ; AVX-NEXT:    vmovups 4(%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_8i16_i16_23u567u9:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movups 4(%eax), %xmm0
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %ebp
+; X32-SSE1-NEXT:  .Ltmp6:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %ebx
+; X32-SSE1-NEXT:  .Ltmp7:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Ltmp8:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 16
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Ltmp9:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 20
+; X32-SSE1-NEXT:  .Ltmp10:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -20
+; X32-SSE1-NEXT:  .Ltmp11:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -16
+; X32-SSE1-NEXT:  .Ltmp12:
+; X32-SSE1-NEXT:    .cfi_offset %ebx, -12
+; X32-SSE1-NEXT:  .Ltmp13:
+; X32-SSE1-NEXT:    .cfi_offset %ebp, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movzwl 4(%ecx), %edx
+; X32-SSE1-NEXT:    movzwl 6(%ecx), %esi
+; X32-SSE1-NEXT:    movzwl 10(%ecx), %edi
+; X32-SSE1-NEXT:    movzwl 12(%ecx), %ebx
+; X32-SSE1-NEXT:    movzwl 14(%ecx), %ebp
+; X32-SSE1-NEXT:    movzwl 18(%ecx), %ecx
+; X32-SSE1-NEXT:    movw %bp, 10(%eax)
+; X32-SSE1-NEXT:    movw %bx, 8(%eax)
+; X32-SSE1-NEXT:    movw %cx, 14(%eax)
+; X32-SSE1-NEXT:    movw %si, 2(%eax)
+; X32-SSE1-NEXT:    movw %dx, (%eax)
+; X32-SSE1-NEXT:    movw %di, 6(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    popl %ebx
+; X32-SSE1-NEXT:    popl %ebp
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_8i16_i16_23u567u9:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups 4(%eax), %xmm0
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
   %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 5
@@ -434,11 +607,21 @@ define <8 x i16> @merge_8i16_i16_34uuuuu
 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_8i16_i16_34uuuuuu:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_8i16_i16_34uuuuuu:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movzwl 6(%ecx), %edx
+; X32-SSE1-NEXT:    movzwl 8(%ecx), %ecx
+; X32-SSE1-NEXT:    movw %cx, 2(%eax)
+; X32-SSE1-NEXT:    movw %dx, (%eax)
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
   %val0 = load i16, i16* %ptr0
@@ -459,11 +642,33 @@ define <8 x i16> @merge_8i16_i16_45u7zzz
 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_8i16_i16_45u7zzzz:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Ltmp14:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:  .Ltmp15:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movzwl 8(%ecx), %edx
+; X32-SSE1-NEXT:    movzwl 10(%ecx), %esi
+; X32-SSE1-NEXT:    movzwl 14(%ecx), %ecx
+; X32-SSE1-NEXT:    movw %si, 2(%eax)
+; X32-SSE1-NEXT:    movw %dx, (%eax)
+; X32-SSE1-NEXT:    movw %cx, 6(%eax)
+; X32-SSE1-NEXT:    movw $0, 14(%eax)
+; X32-SSE1-NEXT:    movw $0, 12(%eax)
+; X32-SSE1-NEXT:    movw $0, 10(%eax)
+; X32-SSE1-NEXT:    movw $0, 8(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
   %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
@@ -491,11 +696,73 @@ define <16 x i8> @merge_16i8_i8_01u34567
 ; AVX-NEXT:    vmovups (%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movups (%eax), %xmm0
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %ebx
+; X32-SSE1-NEXT:  .Ltmp16:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    subl $12, %esp
+; X32-SSE1-NEXT:  .Ltmp17:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 20
+; X32-SSE1-NEXT:  .Ltmp18:
+; X32-SSE1-NEXT:    .cfi_offset %ebx, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movb (%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 1(%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 3(%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 4(%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 5(%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 6(%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 7(%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 8(%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 9(%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 10(%ecx), %bh
+; X32-SSE1-NEXT:    movb 11(%ecx), %bl
+; X32-SSE1-NEXT:    movb 12(%ecx), %dh
+; X32-SSE1-NEXT:    movb 13(%ecx), %dl
+; X32-SSE1-NEXT:    movb 15(%ecx), %cl
+; X32-SSE1-NEXT:    movb %dl, 13(%eax)
+; X32-SSE1-NEXT:    movb %dh, 12(%eax)
+; X32-SSE1-NEXT:    movb %cl, 15(%eax)
+; X32-SSE1-NEXT:    movb %bl, 11(%eax)
+; X32-SSE1-NEXT:    movb %bh, 10(%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, 9(%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, 8(%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, 7(%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, 6(%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, 5(%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, 4(%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, 1(%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, (%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, 3(%eax)
+; X32-SSE1-NEXT:    addl $12, %esp
+; X32-SSE1-NEXT:    popl %ebx
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
   %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
@@ -552,11 +819,28 @@ define <16 x i8> @merge_16i8_i8_01u3uuzz
 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movb (%ecx), %dl
+; X32-SSE1-NEXT:    movb 1(%ecx), %dh
+; X32-SSE1-NEXT:    movb 3(%ecx), %cl
+; X32-SSE1-NEXT:    movb %dh, 1(%eax)
+; X32-SSE1-NEXT:    movb %dl, (%eax)
+; X32-SSE1-NEXT:    movb %cl, 3(%eax)
+; X32-SSE1-NEXT:    movb $0, 15(%eax)
+; X32-SSE1-NEXT:    movb $0, 14(%eax)
+; X32-SSE1-NEXT:    movb $0, 13(%eax)
+; X32-SSE1-NEXT:    movb $0, 7(%eax)
+; X32-SSE1-NEXT:    movb $0, 6(%eax)
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
   %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
@@ -585,11 +869,44 @@ define <16 x i8> @merge_16i8_i8_0123uu67
 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %ebx
+; X32-SSE1-NEXT:  .Ltmp19:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %eax
+; X32-SSE1-NEXT:  .Ltmp20:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Ltmp21:
+; X32-SSE1-NEXT:    .cfi_offset %ebx, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movb (%ecx), %dl
+; X32-SSE1-NEXT:    movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
+; X32-SSE1-NEXT:    movb 1(%ecx), %dh
+; X32-SSE1-NEXT:    movb 2(%ecx), %bl
+; X32-SSE1-NEXT:    movb 3(%ecx), %bh
+; X32-SSE1-NEXT:    movb 6(%ecx), %dl
+; X32-SSE1-NEXT:    movb 7(%ecx), %cl
+; X32-SSE1-NEXT:    movb %cl, 7(%eax)
+; X32-SSE1-NEXT:    movb %dl, 6(%eax)
+; X32-SSE1-NEXT:    movb %bh, 3(%eax)
+; X32-SSE1-NEXT:    movb %bl, 2(%eax)
+; X32-SSE1-NEXT:    movb %dh, 1(%eax)
+; X32-SSE1-NEXT:    movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
+; X32-SSE1-NEXT:    movb %cl, (%eax)
+; X32-SSE1-NEXT:    movb $0, 15(%eax)
+; X32-SSE1-NEXT:    movb $0, 14(%eax)
+; X32-SSE1-NEXT:    movb $0, 13(%eax)
+; X32-SSE1-NEXT:    addl $4, %esp
+; X32-SSE1-NEXT:    popl %ebx
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
   %ptr2 = getelementptr inbounds i8, i8* %ptr, i64 2
@@ -639,13 +956,24 @@ define void @merge_4i32_i32_combine(<4 x
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4i32_i32_combine:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT:    movaps %xmm0, (%eax)
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4i32_i32_combine:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl (%ecx), %ecx
+; X32-SSE1-NEXT:    movl %ecx, (%eax)
+; X32-SSE1-NEXT:    movl $0, 12(%eax)
+; X32-SSE1-NEXT:    movl $0, 8(%eax)
+; X32-SSE1-NEXT:    movl $0, 4(%eax)
+; X32-SSE1-NEXT:    retl
+;
+; X32-SSE41-LABEL: merge_4i32_i32_combine:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT:    movaps %xmm0, (%eax)
+; X32-SSE41-NEXT:    retl
  %1 = getelementptr i32, i32* %src, i32 0
  %2 = load i32, i32* %1
  %3 = insertelement <4 x i32> undef, i32 %2, i32 0
@@ -675,14 +1003,40 @@ define <2 x i64> @merge_2i64_i64_12_vola
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_2i64_i64_12_volatile:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT:    pinsrd $1, 12(%eax), %xmm0
-; X32-SSE-NEXT:    pinsrd $2, 16(%eax), %xmm0
-; X32-SSE-NEXT:    pinsrd $3, 20(%eax), %xmm0
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Ltmp22:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Ltmp23:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Ltmp24:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Ltmp25:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 8(%ecx), %edx
+; X32-SSE1-NEXT:    movl 12(%ecx), %esi
+; X32-SSE1-NEXT:    movl 16(%ecx), %edi
+; X32-SSE1-NEXT:    movl 20(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE1-NEXT:    movl %edi, 8(%eax)
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_2i64_i64_12_volatile:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT:    pinsrd $1, 12(%eax), %xmm0
+; X32-SSE41-NEXT:    pinsrd $2, 16(%eax), %xmm0
+; X32-SSE41-NEXT:    pinsrd $3, 20(%eax), %xmm0
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
   %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
   %val0 = load volatile i64, i64* %ptr0
@@ -720,14 +1074,26 @@ define <4 x float> @merge_4f32_f32_2345_
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4f32_f32_2345_volatile:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4f32_f32_2345_volatile:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE1-NEXT:    retl
+;
+; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; X32-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
   %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
@@ -764,15 +1130,25 @@ define <4 x float> @merge_4f32_f32_X0YY(
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
 ;
-; X32-SSE-LABEL: merge_4f32_f32_X0YY:
-; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE-NEXT:    retl
+; X32-SSE1-LABEL: merge_4f32_f32_X0YY:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X32-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE1-NEXT:    retl
+;
+; X32-SSE41-LABEL: merge_4f32_f32_X0YY:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X32-SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE41-NEXT:    retl
   %val0 = load float, float* %ptr0, align 4
   %val1 = load float, float* %ptr1, align 4
   %res0 = insertelement <4 x float> undef, float %val0, i32 0



