[llvm] r270081 - [X86][SSE] Added fast-isel tests to sync with clang/test/CodeGen/sse-builtins.c

Thu May 19 09:55:53 PDT 2016

Author: rksimon
Date: Thu May 19 11:55:52 2016
New Revision: 270081

URL: http://llvm.org/viewvc/llvm-project?rev=270081&view=rev
Log:
[X86][SSE] Added fast-isel tests to sync with clang/test/CodeGen/sse-builtins.c

Added:
    llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
    llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll

Added: llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll?rev=270081&view=auto
==============================================================================

--- llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll (added)
+++ llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll Thu May 19 11:55:52 2016
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
+
+define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
+; X64-LABEL: test_mm_cvtsi64_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cvtsi2ssq %rdi, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %cvt = sitofp i64 %a1 to float
+  %res = insertelement <4 x float> %a0, float %cvt, i32 0
+  ret <4 x float> %res
+}
+
+define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
+; X64-LABEL: test_mm_cvtss_si64:
+; X64:       # BB#0:
+; X64-NEXT:    cvtss2si %xmm0, %rax
+; X64-NEXT:    retq
+  %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+
+define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
+; X64-LABEL: test_mm_cvttss_si64:
+; X64:       # BB#0:
+; X64-NEXT:    cvttss2si %xmm0, %rax
+; X64-NEXT:    retq
+  %cvt = extractelement <4 x float> %a0, i32 0
+  %res = fptosi float %cvt to i64
+  ret i64 %res
+}

Added: llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll?rev=270081&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll (added)
+++ llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll Thu May 19 11:55:52 2016
@@ -0,0 +1,2280 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
+
+define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_add_ps:
+; X32:       # BB#0:
+; X32-NEXT:    addps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_add_ps:
+; X64:       # BB#0:
+; X64-NEXT:    addps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = fadd <4 x float> %a0, %a1
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_add_ss:
+; X32:       # BB#0:
+; X32-NEXT:    addss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_add_ss:
+; X64:       # BB#0:
+; X64-NEXT:    addss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %ext0 = extractelement <4 x float> %a0, i32 0
+  %ext1 = extractelement <4 x float> %a1, i32 0
+  %fadd = fadd float %ext0, %ext1
+  %res = insertelement <4 x float> %a0, float %fadd, i32 0
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_and_ps:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl %esi, (%esp)
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT:    leal -4(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_and_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    andl %eax, %edx
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    andl %r8d, %ecx
+; X64-NEXT:    shrq $32, %r8
+; X64-NEXT:    shrq $32, %rsi
+; X64-NEXT:    shrq $32, %rdi
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl %r8d, %edi
+; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl %eax, %esi
+; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT:    retq
+  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+  %res = and <4 x i32> %arg0, %arg1
+  %bc = bitcast <4 x i32> %res to <4 x float>
+  ret <4 x float> %bc
+}
+
+define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_andnot_ps:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    notl %edx
+; X32-NEXT:    notl %ecx
+; X32-NEXT:    notl %esi
+; X32-NEXT:    notl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT:    leal -4(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_andnot_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    shrq $32, %rdx
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    shrq $32, %rsi
+; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    notl %eax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    shrq $32, %rdi
+; X64-NEXT:    notl %ecx
+; X64-NEXT:    andl %r8d, %ecx
+; X64-NEXT:    shrq $32, %r8
+; X64-NEXT:    notl %esi
+; X64-NEXT:    notl %edx
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl %r8d, %edx
+; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl %edi, %esi
+; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT:    retq
+  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %res = and <4 x i32> %not, %arg1
+  %bc = bitcast <4 x i32> %res to <4 x float>
+  ret <4 x float> %bc
+}
+
+define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpeqps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpeq_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpeqps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpeqss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpeq_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpeqss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpleps %xmm0, %xmm1
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpge_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpleps %xmm0, %xmm1
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a1, <4 x float> %a0, i8 2)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpless %xmm0, %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpge_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpless %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
+  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpltps %xmm0, %xmm1
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpgt_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpltps %xmm0, %xmm1
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a1, <4 x float> %a0, i8 1)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpltss %xmm0, %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpgt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpltss %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
+  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpleps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmple_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpleps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 2)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpless %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmple_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpless %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpltps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmplt_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpltps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 1)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpltss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmplt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpltss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpneqps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpneq_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpneqps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 4)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpneqss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpneq_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpneqss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnleps %xmm0, %xmm1
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnge_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnleps %xmm0, %xmm1
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a1, <4 x float> %a0, i8 6)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnless %xmm0, %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnge_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnless %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
+  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnltps %xmm0, %xmm1
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpngt_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnltps %xmm0, %xmm1
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a1, <4 x float> %a0, i8 5)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnltss %xmm0, %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpngt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnltss %xmm0, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
+  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnleps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnle_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnleps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 6)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnless %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnle_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnless %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnltps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnlt_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnltps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 5)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnltss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnlt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnltss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpordps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpord_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpordps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpordss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpord_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpordss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cmpunordps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpunord_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cmpunordps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 3)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cmpunordss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpunord_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cmpunordss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
+  ret <4 x float> %res
+}
+
+define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comieq_ss:
+; X32:       # BB#0:
+; X32-NEXT:    comiss %xmm1, %xmm0
+; X32-NEXT:    setnp %al
+; X32-NEXT:    sete %cl
+; X32-NEXT:    andb %al, %cl
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comieq_ss:
+; X64:       # BB#0:
+; X64-NEXT:    comiss %xmm1, %xmm0
+; X64-NEXT:    setnp %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comige_ss:
+; X32:       # BB#0:
+; X32-NEXT:    comiss %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comige_ss:
+; X64:       # BB#0:
+; X64-NEXT:    comiss %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comigt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    comiss %xmm1, %xmm0
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comigt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    comiss %xmm1, %xmm0
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comile_ss:
+; X32:       # BB#0:
+; X32-NEXT:    comiss %xmm0, %xmm1
+; X32-NEXT:    setae %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comile_ss:
+; X64:       # BB#0:
+; X64-NEXT:    comiss %xmm0, %xmm1
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comilt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    comiss %xmm0, %xmm1
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comilt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    comiss %xmm0, %xmm1
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comineq_ss:
+; X32:       # BB#0:
+; X32-NEXT:    comiss %xmm1, %xmm0
+; X32-NEXT:    setp %al
+; X32-NEXT:    setne %cl
+; X32-NEXT:    orb %al, %cl
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comineq_ss:
+; X64:       # BB#0:
+; X64-NEXT:    comiss %xmm1, %xmm0
+; X64-NEXT:    setp %al
+; X64-NEXT:    setne %cl
+; X64-NEXT:    orb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvt_ss2si:
+; X32:       # BB#0:
+; X32-NEXT:    cvtss2si %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvt_ss2si:
+; X64:       # BB#0:
+; X64-NEXT:    cvtss2si %xmm0, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
+; X32-LABEL: test_mm_cvtsi32_ss:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    cvtsi2ssl %eax, %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtsi32_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cvtsi2ssl %edi, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %cvt = sitofp i32 %a1 to float
+  %res = insertelement <4 x float> %a0, float %cvt, i32 0
+  ret <4 x float> %res
+}
+
+define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtss_f32:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    movss %xmm0, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtss_f32:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  %res = extractelement <4 x float> %a0, i32 0
+  ret float %res
+}
+
+define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtss_si32:
+; X32:       # BB#0:
+; X32-NEXT:    cvtss2si %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtss_si32:
+; X64:       # BB#0:
+; X64-NEXT:    cvtss2si %xmm0, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
+  ret i32 %res
+}
+
+define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvttss_si:
+; X32:       # BB#0:
+; X32-NEXT:    cvttss2si %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvttss_si:
+; X64:       # BB#0:
+; X64-NEXT:    cvttss2si %xmm0, %eax
+; X64-NEXT:    retq
+  %cvt = extractelement <4 x float> %a0, i32 0
+  %res = fptosi float %cvt to i32
+  ret i32 %res
+}
+
+define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvttss_si32:
+; X32:       # BB#0:
+; X32-NEXT:    cvttss2si %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvttss_si32:
+; X64:       # BB#0:
+; X64-NEXT:    cvttss2si %xmm0, %eax
+; X64-NEXT:    retq
+  %cvt = extractelement <4 x float> %a0, i32 0
+  %res = fptosi float %cvt to i32
+  ret i32 %res
+}
+
+define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_div_ps:
+; X32:       # BB#0:
+; X32-NEXT:    divps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_div_ps:
+; X64:       # BB#0:
+; X64-NEXT:    divps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = fdiv <4 x float> %a0, %a1
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_div_ss:
+; X32:       # BB#0:
+; X32-NEXT:    divss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_div_ss:
+; X64:       # BB#0:
+; X64-NEXT:    divss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %ext0 = extractelement <4 x float> %a0, i32 0
+  %ext1 = extractelement <4 x float> %a1, i32 0
+  %fdiv = fdiv float %ext0, %ext1
+  %res = insertelement <4 x float> %a0, float %fdiv, i32 0
+  ret <4 x float> %res
+}
+
+define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
+; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    leal (%esp), %eax
+; X32-NEXT:    stmxcsr (%eax)
+; X32-NEXT:    movl (%esp), %eax
+; X32-NEXT:    andl $8064, %eax # imm = 0x1F80
+; X32-NEXT:    popl %ecx
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    stmxcsr (%rax)
+; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    andl $8064, %eax # imm = 0x1F80
+; X64-NEXT:    retq
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  call void @llvm.x86.sse.stmxcsr(i8* %2)
+  %3 = load i32, i32* %1, align 4
+  %4 = and i32 %3, 8064
+  ret i32 %4
+}
+declare void @llvm.x86.sse.stmxcsr(i8*) nounwind ; stores MXCSR through the pointer argument, so it must not be marked readnone
+
+define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
+; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    leal (%esp), %eax
+; X32-NEXT:    stmxcsr (%eax)
+; X32-NEXT:    movl (%esp), %eax
+; X32-NEXT:    andl $63, %eax
+; X32-NEXT:    popl %ecx
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    stmxcsr (%rax)
+; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    andl $63, %eax
+; X64-NEXT:    retq
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  call void @llvm.x86.sse.stmxcsr(i8* %2)
+  %3 = load i32, i32* %1, align 4
+  %4 = and i32 %3, 63
+  ret i32 %4
+}
+
+define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
+; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    leal (%esp), %eax
+; X32-NEXT:    stmxcsr (%eax)
+; X32-NEXT:    movl (%esp), %eax
+; X32-NEXT:    andl $32768, %eax # imm = 0x8000
+; X32-NEXT:    popl %ecx
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    stmxcsr (%rax)
+; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    andl $32768, %eax # imm = 0x8000
+; X64-NEXT:    retq
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  call void @llvm.x86.sse.stmxcsr(i8* %2)
+  %3 = load i32, i32* %1, align 4
+  %4 = and i32 %3, 32768
+  ret i32 %4
+}
+
+define i32 @test_MM_GET_ROUNDING_MODE() nounwind { ; IR form of _MM_GET_ROUNDING_MODE(): read MXCSR, return only bits 13-14
+; X32-LABEL: test_MM_GET_ROUNDING_MODE:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    leal (%esp), %eax
+; X32-NEXT:    stmxcsr (%eax)
+; X32-NEXT:    movl (%esp), %eax
+; X32-NEXT:    andl $24576, %eax # imm = 0x6000
+; X32-NEXT:    popl %ecx
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_MM_GET_ROUNDING_MODE:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    stmxcsr (%rax)
+; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    andl $24576, %eax # imm = 0x6000
+; X64-NEXT:    retq
+  %1 = alloca i32, align 4 ; 4-byte stack slot that receives the MXCSR value
+  %2 = bitcast i32* %1 to i8* ; the intrinsic takes an i8* operand
+  call void @llvm.x86.sse.stmxcsr(i8* %2) ; writes MXCSR into the slot
+  %3 = load i32, i32* %1, align 4
+  %4 = and i32 %3, 24576 ; 24576 = 0x6000 (_MM_ROUND_MASK in xmmintrin.h)
+  ret i32 %4
+}
+
+define i32 @test_mm_getcsr() nounwind { ; IR form of _mm_getcsr(): return the full 32-bit MXCSR value, unmasked
+; X32-LABEL: test_mm_getcsr:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    leal (%esp), %eax
+; X32-NEXT:    stmxcsr (%eax)
+; X32-NEXT:    movl (%esp), %eax
+; X32-NEXT:    popl %ecx
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_getcsr:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    stmxcsr (%rax)
+; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    retq
+  %1 = alloca i32, align 4 ; 4-byte stack slot that receives the MXCSR value
+  %2 = bitcast i32* %1 to i8* ; the intrinsic takes an i8* operand
+  call void @llvm.x86.sse.stmxcsr(i8* %2) ; writes MXCSR into the slot
+  %3 = load i32, i32* %1, align 4
+  ret i32 %3
+}
+
+define <4 x float> @test_mm_load_ps(float* %a0) nounwind { ; IR form of _mm_load_ps: 16-byte-aligned load of four floats
+; X32-LABEL: test_mm_load_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_load_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast float* %a0 to <4 x float>*
+  %res = load <4 x float>, <4 x float>* %arg0, align 16 ; align 16 is what permits the aligned movaps in the expected output
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_load_ps1(float* %a0) nounwind { ; IR form of _mm_load_ps1: load one float and replicate it into all four lanes
+; X32-LABEL: test_mm_load_ps1:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_load_ps1:
+; X64:       # BB#0:
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    retq
+  %ld = load float, float* %a0, align 4 ; single scalar load
+  %res0 = insertelement <4 x float> undef, float %ld, i32 0 ; the same value goes into lanes 0..3
+  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
+  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
+  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
+  ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_load_ss(float* %a0) nounwind { ; IR form of _mm_load_ss: load one float into lane 0, zero lanes 1-3
+; X32-LABEL: test_mm_load_ss:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_load_ss:
+; X64:       # BB#0:
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    retq
+  %ld = load float, float* %a0, align 1 ; align 1: no alignment is assumed for the scalar load
+  %res0 = insertelement <4 x float> undef, float %ld, i32 0 ; lane 0 = loaded value
+  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1 ; lanes 1..3 = 0.0
+  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
+  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
+  ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_load1_ps(float* %a0) nounwind { ; IR form of _mm_load1_ps: load one float and replicate it into all four lanes
+; X32-LABEL: test_mm_load1_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_load1_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    retq
+  %ld = load float, float* %a0, align 4 ; single scalar load
+  %res0 = insertelement <4 x float> undef, float %ld, i32 0 ; the same value goes into lanes 0..3
+  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
+  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
+  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
+  ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) { ; IR form of _mm_loadh_pi: keep a0[0,1], replace lanes 2-3 with two floats loaded from a1
+; X32-LABEL: test_mm_loadh_pi:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadh_pi:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    retq
+  %ptr = bitcast x86_mmx* %a1 to <2 x float>* ; reinterpret the 64-bit operand as a pair of floats
+  %ld  = load <2 x float>, <2 x float>* %ptr
+  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; widen to 4 lanes; upper two lanes are don't-care
+  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; result = a0[0], a0[1], ext[0], ext[1]
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) { ; IR form of _mm_loadl_pi: replace lanes 0-1 with two floats loaded from a1, keep a0[2,3]
+; X32-LABEL: test_mm_loadl_pi:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadl_pi:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %ptr = bitcast x86_mmx* %a1 to <2 x float>* ; reinterpret the 64-bit operand as a pair of floats
+  %ld  = load <2 x float>, <2 x float>* %ptr
+  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; widen to 4 lanes; upper two lanes are don't-care
+  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3> ; result = ext[0], ext[1], a0[2], a0[3]
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind { ; IR form of _mm_loadr_ps: aligned load of four floats, returned in reversed lane order
+; X32-LABEL: test_mm_loadr_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps (%eax), %xmm0
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadr_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-NEXT:    retq
+  %arg0 = bitcast float* %a0 to <4 x float>*
+  %ld = load <4 x float>, <4 x float>* %arg0, align 16 ; 16-byte-aligned vector load
+  %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; reverse the lanes
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind { ; IR form of _mm_loadu_ps: unaligned load of four floats
+; X32-LABEL: test_mm_loadu_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movups (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadu_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movups (%rdi), %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast float* %a0 to <4 x float>*
+  %res = load <4 x float>, <4 x float>* %arg0, align 1 ; align 1 is why the expected output uses movups rather than movaps
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) { ; IR form of _mm_max_ps: direct call to the packed-max intrinsic
+; X32-LABEL: test_mm_max_ps:
+; X32:       # BB#0:
+; X32-NEXT:    maxps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_max_ps:
+; X64:       # BB#0:
+; X64-NEXT:    maxps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; expected to select to a single maxps
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) { ; IR form of _mm_max_ss: direct call to the scalar-max intrinsic
+; X32-LABEL: test_mm_max_ss:
+; X32:       # BB#0:
+; X32-NEXT:    maxss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_max_ss:
+; X64:       # BB#0:
+; X64-NEXT:    maxss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; expected to select to a single maxss
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) { ; IR form of _mm_min_ps: direct call to the packed-min intrinsic
+; X32-LABEL: test_mm_min_ps:
+; X32:       # BB#0:
+; X32-NEXT:    minps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_min_ps:
+; X64:       # BB#0:
+; X64-NEXT:    minps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; expected to select to a single minps
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) { ; IR form of _mm_min_ss: direct call to the scalar-min intrinsic
+; X32-LABEL: test_mm_min_ss:
+; X32:       # BB#0:
+; X32-NEXT:    minss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_min_ss:
+; X64:       # BB#0:
+; X64-NEXT:    minss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; expected to select to a single minss
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) { ; IR form of _mm_move_ss: lane 0 from a1, lanes 1-3 from a0
+; X32-LABEL: test_mm_move_ss:
+; X32:       # BB#0:
+; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_move_ss:
+; X64:       # BB#0:
+; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> ; index 4 selects a1[0]; 1..3 select a0[1..3]
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) { ; IR form of _mm_movehl_ps: high half of a1 into the low half of the result, high half of a0 kept
+; X32-LABEL: test_mm_movehl_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_movehl_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; X64-NEXT:    retq
+  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> ; result = a1[2], a1[3], a0[2], a0[3]
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) { ; IR form of _mm_movelh_ps: low half of a0 kept, low half of a1 into the high half of the result
+; X32-LABEL: test_mm_movelh_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_movelh_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    retq
+  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; result = a0[0], a0[1], a1[0], a1[1]
+  ret <4 x float> %res
+}
+
+define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind { ; IR form of _mm_movemask_ps: direct call to the movmsk intrinsic
+; X32-LABEL: test_mm_movemask_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movmskps %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_movemask_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movmskps %xmm0, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; expected to select to a single movmskps
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind { ; IR form of _mm_mul_ps: plain vector fmul, no intrinsic
+; X32-LABEL: test_mm_mul_ps:
+; X32:       # BB#0:
+; X32-NEXT:    mulps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mul_ps:
+; X64:       # BB#0:
+; X64-NEXT:    mulps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = fmul <4 x float> %a0, %a1 ; lane-wise multiply; expected to select to a single mulps
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind { ; IR form of _mm_mul_ss: multiply lane 0 only, lanes 1-3 pass through from a0
+; X32-LABEL: test_mm_mul_ss:
+; X32:       # BB#0:
+; X32-NEXT:    mulss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mul_ss:
+; X64:       # BB#0:
+; X64-NEXT:    mulss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %ext0 = extractelement <4 x float> %a0, i32 0 ; a0[0]
+  %ext1 = extractelement <4 x float> %a1, i32 0 ; a1[0]
+  %fmul = fmul float %ext0, %ext1
+  %res = insertelement <4 x float> %a0, float %fmul, i32 0 ; write the product back into lane 0 of a0
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind { ; IR form of _mm_or_ps: bitwise OR done on the <4 x i32> reinterpretation of both operands
+; X32-LABEL: test_mm_or_ps:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    orl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl %esi, (%esp)
+; X32-NEXT:    orl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT:    leal -4(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_or_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    orl %eax, %edx
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    orl %r8d, %ecx
+; X64-NEXT:    shrq $32, %r8
+; X64-NEXT:    shrq $32, %rsi
+; X64-NEXT:    shrq $32, %rdi
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    orl %r8d, %edi
+; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    orl %eax, %esi
+; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT:    retq
+  %arg0 = bitcast <4 x float> %a0 to <4 x i32> ; reinterpret float lanes as integers so `or` is legal IR
+  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+  %res = or <4 x i32> %arg0, %arg1 ; NOTE(review): expected output does this OR lane by lane in GPRs via the stack, not with one orps - presumably a known codegen weakness being recorded; confirm
+  %bc = bitcast <4 x i32> %res to <4 x float> ; back to the float vector type for the return
+  ret <4 x float> %bc
+}
+
+define void @test_mm_prefetch(i8* %a0) { ; IR form of _mm_prefetch with the operands that yield prefetchnta in the expected output
+; X32-LABEL: test_mm_prefetch:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    prefetchnta (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_prefetch:
+; X64:       # BB#0:
+; X64-NEXT:    prefetchnta (%rdi)
+; X64-NEXT:    retq
+  call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1) ; per the LLVM LangRef the operands are (address, rw, locality, cache type): read, locality 0, data cache
+  ret void
+}
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone ; NOTE(review): readnone looks inaccurate for a prefetch; confirm the declared attributes are ignored for intrinsics
+
+define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) { ; IR form of _mm_rcp_ps: direct call to the packed reciprocal intrinsic
+; X32-LABEL: test_mm_rcp_ps:
+; X32:       # BB#0:
+; X32-NEXT:    rcpps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_rcp_ps:
+; X64:       # BB#0:
+; X64-NEXT:    rcpps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; expected to select to a single rcpps
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) { ; IR form of _mm_rcp_ss: lane 0 from the scalar reciprocal intrinsic, lanes 1-3 copied from a0
+; X32-LABEL: test_mm_rcp_ss:
+; X32:       # BB#0:
+; X32-NEXT:    rcpss %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_rcp_ss:
+; X64:       # BB#0:
+; X64-NEXT:    rcpss %xmm0, %xmm0
+; X64-NEXT:    retq
+  %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
+  %ext0 = extractelement <4 x float> %rcp, i32 0 ; only lane 0 of the intrinsic result is used
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ext1 = extractelement <4 x float> %a0, i32 1 ; lanes 1..3 are rebuilt element by element from a0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ext2 = extractelement <4 x float> %a0, i32 2
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ext3 = extractelement <4 x float> %a0, i32 3
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 ; the whole rebuild is expected to fold into the single rcpss
+  ret <4 x float> %ins3
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) { ; IR form of _mm_rsqrt_ps: direct call to the packed reciprocal-square-root intrinsic
+; X32-LABEL: test_mm_rsqrt_ps:
+; X32:       # BB#0:
+; X32-NEXT:    rsqrtps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_rsqrt_ps:
+; X64:       # BB#0:
+; X64-NEXT:    rsqrtps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; expected to select to a single rsqrtps
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) { ; IR form of _mm_rsqrt_ss: lane 0 from the scalar rsqrt intrinsic, lanes 1-3 copied from a0
+; X32-LABEL: test_mm_rsqrt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    rsqrtss %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_rsqrt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    rsqrtss %xmm0, %xmm0
+; X64-NEXT:    retq
+  %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
+  %ext0 = extractelement <4 x float> %rsqrt, i32 0 ; only lane 0 of the intrinsic result is used
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ext1 = extractelement <4 x float> %a0, i32 1 ; lanes 1..3 are rebuilt element by element from a0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ext2 = extractelement <4 x float> %a0, i32 2
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ext3 = extractelement <4 x float> %a0, i32 3
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 ; the whole rebuild is expected to fold into the single rsqrtss
+  ret <4 x float> %ins3
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind { ; IR form of _MM_SET_EXCEPTION_MASK(a0): read-modify-write of MXCSR bits 7-12
+; X32-LABEL: test_MM_SET_EXCEPTION_MASK:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    leal (%esp), %ecx
+; X32-NEXT:    stmxcsr (%ecx)
+; X32-NEXT:    movl (%esp), %edx
+; X32-NEXT:    andl $-8065, %edx # imm = 0xFFFFFFFFFFFFE07F
+; X32-NEXT:    orl %eax, %edx
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    ldmxcsr (%ecx)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_MM_SET_EXCEPTION_MASK:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    stmxcsr (%rax)
+; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $-8065, %ecx # imm = 0xFFFFFFFFFFFFE07F
+; X64-NEXT:    orl %edi, %ecx
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr (%rax)
+; X64-NEXT:    retq
+  %1 = alloca i32, align 4 ; one stack slot is used for both the read and the write-back
+  %2 = bitcast i32* %1 to i8*
+  call void @llvm.x86.sse.stmxcsr(i8* %2) ; read the current MXCSR into the slot
+  %3 = load i32, i32* %1
+  %4 = and i32 %3, -8065 ; -8065 = ~0x1F80: clear the field (_MM_MASK_MASK in xmmintrin.h)
+  %5 = or i32 %4, %a0 ; merge in the caller's bits; a0 is not masked here
+  store i32 %5, i32* %1
+  call void @llvm.x86.sse.ldmxcsr(i8* %2) ; load the updated value back into MXCSR
+  ret void
+}
+declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone ; NOTE(review): readnone looks inaccurate for an intrinsic that reads memory and writes MXCSR; confirm the declared attributes are ignored for intrinsics
+
+define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind { ; IR form of _MM_SET_EXCEPTION_STATE(a0): read-modify-write of MXCSR bits 0-5
+; X32-LABEL: test_MM_SET_EXCEPTION_STATE:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    leal (%esp), %ecx
+; X32-NEXT:    stmxcsr (%ecx)
+; X32-NEXT:    movl (%esp), %edx
+; X32-NEXT:    andl $-64, %edx
+; X32-NEXT:    orl %eax, %edx
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    ldmxcsr (%ecx)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_MM_SET_EXCEPTION_STATE:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    stmxcsr (%rax)
+; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $-64, %ecx
+; X64-NEXT:    orl %edi, %ecx
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr (%rax)
+; X64-NEXT:    retq
+  %1 = alloca i32, align 4 ; one stack slot is used for both the read and the write-back
+  %2 = bitcast i32* %1 to i8*
+  call void @llvm.x86.sse.stmxcsr(i8* %2) ; read the current MXCSR into the slot
+  %3 = load i32, i32* %1
+  %4 = and i32 %3, -64 ; -64 = ~0x3F: clear the field (_MM_EXCEPT_MASK in xmmintrin.h)
+  %5 = or i32 %4, %a0 ; merge in the caller's bits; a0 is not masked here
+  store i32 %5, i32* %1
+  call void @llvm.x86.sse.ldmxcsr(i8* %2) ; load the updated value back into MXCSR
+  ret void
+}
+
+define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind { ; IR form of _MM_SET_FLUSH_ZERO_MODE(a0): read-modify-write of MXCSR bit 15
+; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    leal (%esp), %ecx
+; X32-NEXT:    stmxcsr (%ecx)
+; X32-NEXT:    movl (%esp), %edx
+; X32-NEXT:    andl $-32769, %edx # imm = 0xFFFFFFFFFFFF7FFF
+; X32-NEXT:    orl %eax, %edx
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    ldmxcsr (%ecx)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    stmxcsr (%rax)
+; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $-32769, %ecx # imm = 0xFFFFFFFFFFFF7FFF
+; X64-NEXT:    orl %edi, %ecx
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr (%rax)
+; X64-NEXT:    retq
+  %1 = alloca i32, align 4 ; one stack slot is used for both the read and the write-back
+  %2 = bitcast i32* %1 to i8*
+  call void @llvm.x86.sse.stmxcsr(i8* %2) ; read the current MXCSR into the slot
+  %3 = load i32, i32* %1
+  %4 = and i32 %3, -32769 ; -32769 = ~0x8000: clear the bit (_MM_FLUSH_ZERO_MASK in xmmintrin.h)
+  %5 = or i32 %4, %a0 ; merge in the caller's bits; a0 is not masked here
+  store i32 %5, i32* %1
+  call void @llvm.x86.sse.ldmxcsr(i8* %2) ; load the updated value back into MXCSR
+  ret void
+}
+
+define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind { ; IR form of _mm_set_ps: build a vector with the arguments in reverse lane order
+; X32-LABEL: test_mm_set_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_set_ps:
+; X64:       # BB#0:
+; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X64-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT:    movaps %xmm3, %xmm0
+; X64-NEXT:    retq
+  %res0  = insertelement <4 x float> undef, float %a3, i32 0 ; last argument lands in lane 0
+  %res1  = insertelement <4 x float> %res0, float %a2, i32 1
+  %res2  = insertelement <4 x float> %res1, float %a1, i32 2
+  %res3  = insertelement <4 x float> %res2, float %a0, i32 3 ; first argument lands in lane 3
+  ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_set_ps1(float %a0) nounwind { ; IR form of _mm_set_ps1: replicate a0 into all four lanes
+; X32-LABEL: test_mm_set_ps1:
+; X32:       # BB#0:
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_set_ps1:
+; X64:       # BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    retq
+  %res0  = insertelement <4 x float> undef, float %a0, i32 0 ; the same scalar goes into lanes 0..3
+  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
+  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
+  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
+  ret <4 x float> %res3
+}
+
+define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind { ; IR form of _MM_SET_ROUNDING_MODE(a0): read-modify-write of MXCSR bits 13-14
+; X32-LABEL: test_MM_SET_ROUNDING_MODE:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    leal (%esp), %ecx
+; X32-NEXT:    stmxcsr (%ecx)
+; X32-NEXT:    movl (%esp), %edx
+; X32-NEXT:    andl $-24577, %edx # imm = 0xFFFFFFFFFFFF9FFF
+; X32-NEXT:    orl %eax, %edx
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    ldmxcsr (%ecx)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_MM_SET_ROUNDING_MODE:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    stmxcsr (%rax)
+; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $-24577, %ecx # imm = 0xFFFFFFFFFFFF9FFF
+; X64-NEXT:    orl %edi, %ecx
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr (%rax)
+; X64-NEXT:    retq
+  %1 = alloca i32, align 4 ; one stack slot is used for both the read and the write-back
+  %2 = bitcast i32* %1 to i8*
+  call void @llvm.x86.sse.stmxcsr(i8* %2) ; read the current MXCSR into the slot
+  %3 = load i32, i32* %1
+  %4 = and i32 %3, -24577 ; -24577 = ~0x6000: clear the field (_MM_ROUND_MASK in xmmintrin.h)
+  %5 = or i32 %4, %a0 ; merge in the caller's bits; a0 is not masked here
+  store i32 %5, i32* %1
+  call void @llvm.x86.sse.ldmxcsr(i8* %2) ; load the updated value back into MXCSR
+  ret void
+}
+
+define <4 x float> @test_mm_set_ss(float %a0) nounwind { ; IR form of _mm_set_ss: a0 in lane 0, 0.0 in lanes 1-3
+; X32-LABEL: test_mm_set_ss:
+; X32:       # BB#0:
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_set_ss:
+; X64:       # BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res0  = insertelement <4 x float> undef, float %a0, i32 0 ; lane 0 = a0
+  %res1  = insertelement <4 x float> %res0, float 0.0, i32 1 ; lanes 1..3 = 0.0
+  %res2  = insertelement <4 x float> %res1, float 0.0, i32 2
+  %res3  = insertelement <4 x float> %res2, float 0.0, i32 3
+  ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_set1_ps(float %a0) nounwind { ; IR form of _mm_set1_ps: replicate a0 into all four lanes
+; X32-LABEL: test_mm_set1_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_set1_ps:
+; X64:       # BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    retq
+  %res0  = insertelement <4 x float> undef, float %a0, i32 0 ; the same scalar goes into lanes 0..3
+  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
+  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
+  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
+  ret <4 x float> %res3
+}
+
+define void @test_mm_setcsr(i32 %a0) nounwind { ; IR form of _mm_setcsr(a0): spill a0 to the stack and load MXCSR from that slot
+; X32-LABEL: test_mm_setcsr:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    leal (%esp), %ecx
+; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    ldmxcsr (%ecx)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_setcsr:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr (%rax)
+; X64-NEXT:    retq
+  %st = alloca i32, align 4 ; the intrinsic takes a pointer operand, so the value must live in memory
+  store i32 %a0, i32* %st, align 4
+  %bc = bitcast i32* %st to i8* ; the intrinsic takes an i8* operand
+  call void @llvm.x86.sse.ldmxcsr(i8* %bc) ; the whole register is replaced; no read-modify-write
+  ret void
+}
+
+define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { ; IR form of _mm_setr_ps: build a vector with the arguments in lane order
+; X32-LABEL: test_mm_setr_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_setr_ps:
+; X64:       # BB#0:
+; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    retq
+  %res0  = insertelement <4 x float> undef, float %a0, i32 0 ; first argument lands in lane 0
+  %res1  = insertelement <4 x float> %res0, float %a1, i32 1
+  %res2  = insertelement <4 x float> %res1, float %a2, i32 2
+  %res3  = insertelement <4 x float> %res2, float %a3, i32 3 ; last argument lands in lane 3
+  ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_setzero_ps() { ; IR form of _mm_setzero_ps: return an all-zero vector
+; X32-LABEL: test_mm_setzero_ps:
+; X32:       # BB#0:
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_setzero_ps:
+; X64:       # BB#0:
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    retq
+  ret <4 x float> zeroinitializer ; expected to materialise as a register self-xor, not a constant load
+}
+
+define void @test_mm_sfence() nounwind { ; IR form of _mm_sfence: direct call to the sfence intrinsic
+; X32-LABEL: test_mm_sfence:
+; X32:       # BB#0:
+; X32-NEXT:    sfence
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sfence:
+; X64:       # BB#0:
+; X64-NEXT:    sfence
+; X64-NEXT:    retq
+  call void @llvm.x86.sse.sfence() ; expected to select to a single sfence
+  ret void
+}
+declare void @llvm.x86.sse.sfence() nounwind readnone ; NOTE(review): readnone looks inaccurate for a fence; confirm the declared attributes are ignored for intrinsics
+
+define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind { ; IR form of _mm_shuffle_ps with an all-zero selector
+; X32-LABEL: test_mm_shuffle_ps:
+; X32:       # BB#0:
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_shuffle_ps:
+; X64:       # BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X64-NEXT:    retq
+  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4> ; result = a0[0], a0[0], a1[0], a1[0]
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) { ; IR form of _mm_sqrt_ps: direct call to the packed square-root intrinsic
+; X32-LABEL: test_mm_sqrt_ps:
+; X32:       # BB#0:
+; X32-NEXT:    sqrtps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sqrt_ps:
+; X64:       # BB#0:
+; X64-NEXT:    sqrtps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; expected to select to a single sqrtps
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) { ; IR form of _mm_sqrt_ss: lane 0 from the scalar sqrt intrinsic, lanes 1-3 copied from a0
+; X32-LABEL: test_mm_sqrt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    sqrtss %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sqrt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    sqrtss %xmm0, %xmm0
+; X64-NEXT:    retq
+  %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
+  %ext0 = extractelement <4 x float> %sqrt, i32 0 ; only lane 0 of the intrinsic result is used
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ext1 = extractelement <4 x float> %a0, i32 1 ; lanes 1..3 are rebuilt element by element from a0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ext2 = extractelement <4 x float> %a0, i32 2
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ext3 = extractelement <4 x float> %a0, i32 3
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 ; the whole rebuild is expected to fold into the single sqrtss
+  ret <4 x float> %ins3
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define void @test_mm_store_ps(float *%a0, <4 x float> %a1) { ; IR form of _mm_store_ps: 16-byte-aligned store of four floats
+; X32-LABEL: test_mm_store_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_store_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast float* %a0 to <4 x float>*
+  store <4 x float> %a1, <4 x float>* %arg0, align 16 ; align 16 is what permits the aligned movaps in the expected output
+  ret void
+}
+
+define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) { ; IR form of _mm_store_ps1: splat lane 0 of a1 and store all four lanes, 16-byte aligned
+; X32-LABEL: test_mm_store_ps1:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    movaps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_store_ps1:
+; X64:       # BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast float* %a0 to <4 x float>*
+  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer ; all-zero mask: every lane takes a1[0]
+  store <4 x float> %shuf, <4 x float>* %arg0, align 16
+  ret void
+}
+
+define void @test_mm_store_ss(float *%a0, <4 x float> %a1) { ; IR form of _mm_store_ss: store lane 0 of a1 as a single float
+; X32-LABEL: test_mm_store_ss:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movss %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_store_ss:
+; X64:       # BB#0:
+; X64-NEXT:    movss %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %ext = extractelement <4 x float> %a1, i32 0 ; a1[0]
+  store float %ext, float* %a0, align 1 ; align 1: no alignment is assumed for the scalar store
+  ret void
+}
+
+define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) { ; _mm_store1_ps pattern: same IR body as test_mm_store_ps1 (broadcast lane 0, aligned store)
+; X32-LABEL: test_mm_store1_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    movaps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_store1_ps:
+; X64:       # BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast float* %a0 to <4 x float>* ; reinterpret the scalar pointer as a vector pointer
+  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer ; all-zero mask: every result lane is lane 0 of %a1
+  store <4 x float> %shuf, <4 x float>* %arg0, align 16 ; aligned store, matched by movaps in the checks
+  ret void
+}
+
+define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind { ; _mm_storeh_pi-style pattern: store the upper 64 bits of %a1 through %a0
+; X32-LABEL: test_mm_storeh_ps:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movaps %xmm0, (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, 4(%eax)
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storeh_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq %rax, (%rdi)
+; X64-NEXT:    retq
+  %ptr = bitcast x86_mmx* %a0 to i64* ; destination is written as a single i64
+  %bc  = bitcast <4 x float> %a1 to <2 x i64> ; view the four floats as two 64-bit halves
+  %ext = extractelement <2 x i64> %bc, i32 1 ; element 1 is the upper half (float lanes 2 and 3)
+  store i64 %ext, i64* %ptr ; the checks show the vector spilled to the stack and the half copied via general-purpose registers
+  ret void
+}
+
+define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind { ; _mm_storel_pi-style pattern: store the lower 64 bits of %a1 through %a0
+; X32-LABEL: test_mm_storel_ps:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $32, %esp
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movaps %xmm0, (%esp)
+; X32-NEXT:    movl (%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, 4(%eax)
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storel_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq %rax, (%rdi)
+; X64-NEXT:    retq
+  %ptr = bitcast x86_mmx* %a0 to i64* ; destination is written as a single i64
+  %bc  = bitcast <4 x float> %a1 to <2 x i64> ; view the four floats as two 64-bit halves
+  %ext = extractelement <2 x i64> %bc, i32 0 ; element 0 is the lower half (float lanes 0 and 1)
+  store i64 %ext, i64* %ptr ; the checks show the vector spilled to the stack and the half copied via general-purpose registers
+  ret void
+}
+
+define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) { ; _mm_storer_ps pattern: store the lanes of %a1 to %a0 in reversed order (aligned)
+; X32-LABEL: test_mm_storer_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X32-NEXT:    movaps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storer_ps:
+; X64:       # BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast float* %a0 to <4 x float>* ; reinterpret the scalar pointer as a vector pointer
+  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; mask 3,2,1,0 reverses the lane order
+  store <4 x float> %shuf, <4 x float>* %arg0, align 16 ; aligned store, matched by movaps in the checks
+  ret void
+}
+
+define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) { ; _mm_storeu_ps pattern: write all four lanes of %a1 to %a0 without an alignment guarantee
+; X32-LABEL: test_mm_storeu_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movups %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storeu_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movups %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast float* %a0 to <4 x float>* ; reinterpret the scalar pointer as a vector pointer
+  store <4 x float> %a1, <4 x float>* %arg0, align 1 ; align 1 is the only difference from test_mm_store_ps; the checks expect movups
+  ret void
+}
+
+define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) { ; _mm_stream_ps pattern: aligned vector store tagged as non-temporal
+; X32-LABEL: test_mm_stream_ps:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movntps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_stream_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movntps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast float* %a0 to <4 x float>* ; reinterpret the scalar pointer as a vector pointer
+  store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0 ; the !nontemporal attachment (node !0 at the end of the file) is what the movntps check corresponds to
+  ret void
+}
+
+define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_sub_ps pattern: lane-wise %a0 - %a1
+; X32-LABEL: test_mm_sub_ps:
+; X32:       # BB#0:
+; X32-NEXT:    subps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sub_ps:
+; X64:       # BB#0:
+; X64-NEXT:    subps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = fsub <4 x float> %a0, %a1 ; plain vector fsub; the checks expect a single subps
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_sub_ss pattern: subtract lane 0 only, keep lanes 1-3 of %a0
+; X32-LABEL: test_mm_sub_ss:
+; X32:       # BB#0:
+; X32-NEXT:    subss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sub_ss:
+; X64:       # BB#0:
+; X64-NEXT:    subss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %ext0 = extractelement <4 x float> %a0, i32 0 ; lane 0 of the minuend
+  %ext1 = extractelement <4 x float> %a1, i32 0 ; lane 0 of the subtrahend
+  %fsub = fsub float %ext0, %ext1 ; scalar difference
+  %res = insertelement <4 x float> %a0, float %fsub, i32 0 ; write the difference into lane 0; the other lanes come from %a0 unchanged
+  ret <4 x float> %res
+}
+
+define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind { ; _MM_TRANSPOSE4_PS pattern: transpose, in place, the 4x4 float matrix whose rows live at %a0..%a3
+; X32-LABEL: test_MM_TRANSPOSE4_PS:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movaps (%esi), %xmm0
+; X32-NEXT:    movaps (%edx), %xmm1
+; X32-NEXT:    movaps (%ecx), %xmm2
+; X32-NEXT:    movaps (%eax), %xmm3
+; X32-NEXT:    movaps %xmm0, %xmm4
+; X32-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X32-NEXT:    movaps %xmm2, %xmm5
+; X32-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X32-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X32-NEXT:    movaps %xmm4, %xmm1
+; X32-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X32-NEXT:    movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
+; X32-NEXT:    movaps %xmm0, %xmm3
+; X32-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X32-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; X32-NEXT:    movaps %xmm1, (%esi)
+; X32-NEXT:    movaps %xmm5, (%edx)
+; X32-NEXT:    movaps %xmm3, (%ecx)
+; X32-NEXT:    movaps %xmm2, (%eax)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_MM_TRANSPOSE4_PS:
+; X64:       # BB#0:
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    movaps (%rsi), %xmm1
+; X64-NEXT:    movaps (%rdx), %xmm2
+; X64-NEXT:    movaps (%rcx), %xmm3
+; X64-NEXT:    movaps %xmm0, %xmm4
+; X64-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X64-NEXT:    movaps %xmm2, %xmm5
+; X64-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X64-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X64-NEXT:    movaps %xmm4, %xmm1
+; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X64-NEXT:    movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
+; X64-NEXT:    movaps %xmm0, %xmm3
+; X64-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; X64-NEXT:    movaps %xmm1, (%rdi)
+; X64-NEXT:    movaps %xmm5, (%rsi)
+; X64-NEXT:    movaps %xmm3, (%rdx)
+; X64-NEXT:    movaps %xmm2, (%rcx)
+; X64-NEXT:    retq
+  %row0 = load <4 x float>, <4 x float>* %a0, align 16 ; step 1: load the four rows (all aligned)
+  %row1 = load <4 x float>, <4 x float>* %a1, align 16
+  %row2 = load <4 x float>, <4 x float>* %a2, align 16
+  %row3 = load <4 x float>, <4 x float>* %a3, align 16
+  %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; step 2: interleave row pairs; tmp0 = row0[0],row1[0],row0[1],row1[1]
+  %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; tmp2 = row2[0],row3[0],row2[1],row3[1]
+  %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; tmp1 = row0[2],row1[2],row0[3],row1[3]
+  %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; tmp3 = row2[2],row3[2],row2[3],row3[3]
+  %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; step 3: join 64-bit halves; res0 = low(tmp0),low(tmp2) = column 0
+  %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3> ; operands swapped: res1 = high(tmp0),high(tmp2) = column 1
+  %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; res2 = low(tmp1),low(tmp3) = column 2
+  %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> ; operands swapped: res3 = high(tmp1),high(tmp3) = column 3
+  store <4 x float> %res0, <4 x float>* %a0, align 16 ; step 4: write the columns back over the original rows
+  store <4 x float> %res1, <4 x float>* %a1, align 16
+  store <4 x float> %res2, <4 x float>* %a2, align 16
+  store <4 x float> %res3, <4 x float>* %a3, align 16
+  ret void
+}
+
+define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_ucomieq_ss pattern: returns the i32 result of the ucomieq.ss intrinsic on %a0 and %a1
+; X32-LABEL: test_mm_ucomieq_ss:
+; X32:       # BB#0:
+; X32-NEXT:    ucomiss %xmm1, %xmm0
+; X32-NEXT:    setnp %al
+; X32-NEXT:    sete %cl
+; X32-NEXT:    andb %al, %cl
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomieq_ss:
+; X64:       # BB#0:
+; X64-NEXT:    ucomiss %xmm1, %xmm0
+; X64-NEXT:    setnp %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; the checks expect ucomiss, then sete AND-ed with setnp, zero-extended into eax
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_ucomige_ss pattern: returns the i32 result of the ucomige.ss intrinsic on %a0 and %a1
+; X32-LABEL: test_mm_ucomige_ss:
+; X32:       # BB#0:
+; X32-NEXT:    ucomiss %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomige_ss:
+; X64:       # BB#0:
+; X64-NEXT:    ucomiss %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; the checks expect ucomiss followed by setae, zero-extended into eax
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_ucomigt_ss pattern: returns the i32 result of the ucomigt.ss intrinsic on %a0 and %a1
+; X32-LABEL: test_mm_ucomigt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    ucomiss %xmm1, %xmm0
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomigt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    ucomiss %xmm1, %xmm0
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; the checks expect ucomiss followed by seta, zero-extended into eax
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_ucomile_ss pattern: returns the i32 result of the ucomile.ss intrinsic on %a0 and %a1
+; X32-LABEL: test_mm_ucomile_ss:
+; X32:       # BB#0:
+; X32-NEXT:    ucomiss %xmm0, %xmm1
+; X32-NEXT:    setae %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomile_ss:
+; X64:       # BB#0:
+; X64-NEXT:    ucomiss %xmm0, %xmm1
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; the checks show ucomiss with its operands swapped relative to test_mm_ucomige_ss, then the same setae
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_ucomilt_ss pattern: returns the i32 result of the ucomilt.ss intrinsic on %a0 and %a1
+; X32-LABEL: test_mm_ucomilt_ss:
+; X32:       # BB#0:
+; X32-NEXT:    ucomiss %xmm0, %xmm1
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomilt_ss:
+; X64:       # BB#0:
+; X64-NEXT:    ucomiss %xmm0, %xmm1
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; the checks show ucomiss with its operands swapped relative to test_mm_ucomigt_ss, then the same seta
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_ucomineq_ss pattern: returns the i32 result of the ucomineq.ss intrinsic on %a0 and %a1
+; X32-LABEL: test_mm_ucomineq_ss:
+; X32:       # BB#0:
+; X32-NEXT:    ucomiss %xmm1, %xmm0
+; X32-NEXT:    setp %al
+; X32-NEXT:    setne %cl
+; X32-NEXT:    orb %al, %cl
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomineq_ss:
+; X64:       # BB#0:
+; X64-NEXT:    ucomiss %xmm1, %xmm0
+; X64-NEXT:    setp %al
+; X64-NEXT:    setne %cl
+; X64-NEXT:    orb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; the checks expect ucomiss, then setne OR-ed with setp, zero-extended into eax
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_undefined_ps() { ; _mm_undefined_ps pattern: returns an undef vector
+; X32-LABEL: test_mm_undefined_ps:
+; X32:       # BB#0:
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_undefined_ps:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  ret <4 x float> undef ; no value has to be materialised: the checks expect nothing but the return instruction
+}
+
+define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_unpackhi_ps pattern: interleave the upper two lanes of %a0 and %a1
+; X32-LABEL: test_mm_unpackhi_ps:
+; X32:       # BB#0:
+; X32-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpackhi_ps:
+; X64:       # BB#0:
+; X64-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    retq
+  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; mask 2,6,3,7 selects a0[2],a1[2],a0[3],a1[3]
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_unpacklo_ps pattern: interleave the lower two lanes of %a0 and %a1
+; X32-LABEL: test_mm_unpacklo_ps:
+; X32:       # BB#0:
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpacklo_ps:
+; X64:       # BB#0:
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    retq
+  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; mask 0,4,1,5 selects a0[0],a1[0],a0[1],a1[1]
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind { ; _mm_xor_ps pattern: bitwise XOR of %a0 and %a1, expressed on the integer view of the vectors
+; X32-LABEL: test_mm_xor_ps:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $64, %esp
+; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl %esi, (%esp)
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT:    leal -4(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_xor_ps:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    xorl %eax, %edx
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    xorl %r8d, %ecx
+; X64-NEXT:    shrq $32, %r8
+; X64-NEXT:    shrq $32, %rsi
+; X64-NEXT:    shrq $32, %rdi
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    xorl %r8d, %edi
+; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    xorl %eax, %esi
+; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT:    retq
+  %arg0 = bitcast <4 x float> %a0 to <4 x i32> ; xor is an integer operation, so both inputs are viewed as <4 x i32>
+  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+  %res = xor <4 x i32> %arg0, %arg1 ; the checks record a scalarised lowering (stack spills plus 32-bit xorl per lane), not one vector instruction
+  %bc = bitcast <4 x i32> %res to <4 x float> ; return to the float view for the result
+  ret <4 x float> %bc
+}
+
+!0 = !{i32 1} ; metadata node referenced by the !nontemporal attachment on the store in test_mm_stream_ps