[llvm] r269966 - [X86][SSE2] Added fast-isel tests to sync with clang/test/CodeGen/sse2-builtins.c

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed May 18 11:00:45 PDT 2016


Author: rksimon
Date: Wed May 18 13:00:43 2016
New Revision: 269966

URL: http://llvm.org/viewvc/llvm-project?rev=269966&view=rev
Log:
[X86][SSE2] Added fast-isel tests to sync with clang/test/CodeGen/sse2-builtins.c

Added:
    llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
    llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll

Added: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll?rev=269966&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll (added)
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll Wed May 18 13:00:43 2016
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+
+define i64 @test_mm_cvtsd_si64(<2 x double> %a0) nounwind {
+; X64-LABEL: test_mm_cvtsd_si64:
+; X64:       # BB#0:
+; X64-NEXT:    cvtsd2si %xmm0, %rax
+; X64-NEXT:    retq
+  %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+
+define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind {
+; X64-LABEL: test_mm_cvtsi128_si64:
+; X64:       # BB#0:
+; X64-NEXT:    movd %xmm0, %rax
+; X64-NEXT:    retq
+  %res = extractelement <2 x i64> %a0, i32 0
+  ret i64 %res
+}
+
+define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
+; X64-LABEL: test_mm_cvtsi64_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cvtsi2sdq %rdi, %xmm1
+; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT:    retq
+  %cvt = sitofp i64 %a1 to double
+  %res = insertelement <2 x double> %a0, double %cvt, i32 0
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
+; X64-LABEL: test_mm_cvtsi64_si128:
+; X64:       # BB#0:
+; X64-NEXT:    movd %rdi, %xmm0
+; X64-NEXT:    retq
+  %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
+  %res1 = insertelement <2 x i64> %res0, i64 0, i32 1
+  ret <2 x i64> %res1
+}
+
+define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind {
+; X64-LABEL: test_mm_cvttsd_si64:
+; X64:       # BB#0:
+; X64-NEXT:    cvttsd2si %xmm0, %rax
+; X64-NEXT:    retq
+  %ext = extractelement <2 x double> %a0, i32 0
+  %res = fptosi double %ext to i64
+  ret i64 %res
+}
+
+define void @test_mm_stream_si64(i64 *%a0, i64 %a1) {
+; X64-LABEL: test_mm_stream_si64:
+; X64:       # BB#0:
+; X64-NEXT:    movntiq %rsi, (%rdi)
+; X64-NEXT:    retq
+  store i64 %a1, i64* %a0, align 1, !nontemporal !0
+  ret void
+}
+
+!0 = !{i64 1}

Added: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll?rev=269966&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll (added)
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll Wed May 18 13:00:43 2016
@@ -0,0 +1,3051 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+
+define <2 x i64> @test_mm_add_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    paddb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_add_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    paddb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = add <16 x i8> %arg0, %arg1
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_add_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    paddw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_add_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    paddw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = add <8 x i16> %arg0, %arg1
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_add_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    paddd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_add_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    paddd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %res = add <4 x i32> %arg0, %arg1
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_add_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    paddq %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_add_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    paddq %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = add <2 x i64> %a0, %a1
+  ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_add_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_add_pd:
+; X32:       # BB#0:
+; X32-NEXT:    addpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_add_pd:
+; X64:       # BB#0:
+; X64-NEXT:    addpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = fadd <2 x double> %a0, %a1
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_add_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_add_sd:
+; X32:       # BB#0:
+; X32-NEXT:    addsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_add_sd:
+; X64:       # BB#0:
+; X64-NEXT:    addsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %ext0 = extractelement <2 x double> %a0, i32 0
+  %ext1 = extractelement <2 x double> %a1, i32 0
+  %fadd = fadd double %ext0, %ext1
+  %res = insertelement <2 x double> %a0, double %fadd, i32 0
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_adds_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    paddsb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_adds_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    paddsb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1)
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    paddsw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_adds_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    paddsw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epu8:
+; X32:       # BB#0:
+; X32-NEXT:    paddusb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_adds_epu8:
+; X64:       # BB#0:
+; X64-NEXT:    paddusb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1)
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epu16:
+; X32:       # BB#0:
+; X32-NEXT:    paddusw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_adds_epu16:
+; X64:       # BB#0:
+; X64-NEXT:    paddusw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_and_pd:
+; X32:       # BB#0:
+; X32-NEXT:    andps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_and_pd:
+; X64:       # BB#0:
+; X64-NEXT:    andps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+  %res = and <4 x i32> %arg0, %arg1
+  %bc = bitcast <4 x i32> %res to <2 x double>
+  ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_and_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_and_si128:
+; X32:       # BB#0:
+; X32-NEXT:    andps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_and_si128:
+; X64:       # BB#0:
+; X64-NEXT:    andps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = and <2 x i64> %a0, %a1
+  ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_andnot_pd:
+; X32:       # BB#0:
+; X32-NEXT:    andnps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_andnot_pd:
+; X64:       # BB#0:
+; X64-NEXT:    andnps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %res = and <4 x i32> %not, %arg1
+  %bc = bitcast <4 x i32> %res to <2 x double>
+  ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_andnot_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_andnot_si128:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpeqd %xmm2, %xmm2
+; X32-NEXT:    pxor %xmm2, %xmm0
+; X32-NEXT:    pand %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_andnot_si128:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpeqd %xmm2, %xmm2
+; X64-NEXT:    pxor %xmm2, %xmm0
+; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    retq
+  %not = xor <2 x i64> %a0, <i64 -1, i64 -1>
+  %res = and <2 x i64> %not, %a1
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_avg_epu8:
+; X32:       # BB#0:
+; X32-NEXT:    pavgb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_avg_epu8:
+; X64:       # BB#0:
+; X64-NEXT:    pavgb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1)
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
+
+define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_avg_epu16:
+; X32:       # BB#0:
+; X32-NEXT:    pavgw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_avg_epu16:
+; X64:       # BB#0:
+; X64-NEXT:    pavgw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_bslli_si128:
+; X32:       # BB#0:
+; X32-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_bslli_si128:
+; X64:       # BB#0:
+; X64-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_bsrli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_bsrli_si128:
+; X32:       # BB#0:
+; X32-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_bsrli_si128:
+; X64:       # BB#0:
+; X64-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define void @test_mm_clflush(i8* %a0) nounwind {
+; X32-LABEL: test_mm_clflush:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    clflush (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_clflush:
+; X64:       # BB#0:
+; X64-NEXT:    clflush (%rdi)
+; X64-NEXT:    retq
+  call void @llvm.x86.sse2.clflush(i8* %a0)
+  ret void
+}
+declare void @llvm.x86.sse2.clflush(i8*) nounwind readnone
+
+define <2 x i64> @test_mm_cmpeq_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpeqb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpeq_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %cmp = icmp eq <16 x i8> %arg0, %arg1
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpeq_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpeqw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpeq_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpeqw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %cmp = icmp eq <8 x i16> %arg0, %arg1
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpeq_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpeqd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpeq_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpeqd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %cmp = icmp eq <4 x i32> %arg0, %arg1
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_cmpeq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpeqpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpeq_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpeqpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mm_cmpeq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpeqsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpeq_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpeqsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mm_cmpge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmplepd %xmm0, %xmm1
+; X32-NEXT:    movapd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpge_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmplepd %xmm0, %xmm1
+; X64-NEXT:    movapd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a1, <2 x double> %a0, i8 2)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmplesd %xmm0, %xmm1
+; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpge_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmplesd %xmm0, %xmm1
+; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT:    retq
+  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
+  %ext0 = extractelement <2 x double> %cmp, i32 0
+  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+  %ext1 = extractelement <2 x double> %a0, i32 1
+  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+  ret <2 x double> %ins1
+}
+
+define <2 x i64> @test_mm_cmpgt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpgtb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpgt_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpgtb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %cmp = icmp sgt <16 x i8> %arg0, %arg1
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpgt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpgtw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpgt_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpgtw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %cmp = icmp sgt <8 x i16> %arg0, %arg1
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpgt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpgtd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpgt_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpgtd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %cmp = icmp sgt <4 x i32> %arg0, %arg1
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_cmpgt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpltpd %xmm0, %xmm1
+; X32-NEXT:    movapd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpgt_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpltpd %xmm0, %xmm1
+; X64-NEXT:    movapd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a1, <2 x double> %a0, i8 1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpltsd %xmm0, %xmm1
+; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpgt_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpltsd %xmm0, %xmm1
+; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT:    retq
+  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
+  %ext0 = extractelement <2 x double> %cmp, i32 0
+  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+  %ext1 = extractelement <2 x double> %a0, i32 1
+  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+  ret <2 x double> %ins1
+}
+
+define <2 x double> @test_mm_cmple_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmplepd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmple_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmplepd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 2)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmple_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmplesd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmple_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmplesd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 2)
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cmplt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpgtb %xmm0, %xmm1
+; X32-NEXT:    movdqa %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmplt_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpgtb %xmm0, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %cmp = icmp sgt <16 x i8> %arg1, %arg0
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmplt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpgtw %xmm0, %xmm1
+; X32-NEXT:    movdqa %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmplt_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpgtw %xmm0, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %cmp = icmp sgt <8 x i16> %arg1, %arg0
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmplt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpgtd %xmm0, %xmm1
+; X32-NEXT:    movdqa %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmplt_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpgtd %xmm0, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %cmp = icmp sgt <4 x i32> %arg1, %arg0
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_cmplt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpltpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmplt_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpltpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmplt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpltsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmplt_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpltsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpneq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpneqpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpneq_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpneqpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 4)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpneq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpneqsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpneq_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpneqsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 4)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnlepd %xmm0, %xmm1
+; X32-NEXT:    movapd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnge_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnlepd %xmm0, %xmm1
+; X64-NEXT:    movapd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a1, <2 x double> %a0, i8 6)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnlesd %xmm0, %xmm1
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnge_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnlesd %xmm0, %xmm1
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpngt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnltpd %xmm0, %xmm1
+; X32-NEXT:    movapd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpngt_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnltpd %xmm0, %xmm1
+; X64-NEXT:    movapd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a1, <2 x double> %a0, i8 5)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnltsd %xmm0, %xmm1
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpngt_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnltsd %xmm0, %xmm1
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnle_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnlepd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnle_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnlepd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 6)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnle_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnlesd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnle_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnlesd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 6)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnlt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnltpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnlt_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnltpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 5)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnlt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpnltsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpnlt_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpnltsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 5)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpordpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpord_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpordpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 7)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpordsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpord_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpordsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpunord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpunordpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpunord_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpunordpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 3)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpunord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cmpunordsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cmpunord_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cmpunordsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 3)
+  ret <2 x double> %res
+}
+
+define i32 @test_mm_comieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comieq_sd:
+; X32:       # BB#0:
+; X32-NEXT:    comisd %xmm1, %xmm0
+; X32-NEXT:    setnp %al
+; X32-NEXT:    sete %cl
+; X32-NEXT:    andb %al, %cl
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comieq_sd:
+; X64:       # BB#0:
+; X64-NEXT:    comisd %xmm1, %xmm0
+; X64-NEXT:    setnp %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comige_sd:
+; X32:       # BB#0:
+; X32-NEXT:    comisd %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comige_sd:
+; X64:       # BB#0:
+; X64-NEXT:    comisd %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comigt_sd:
+; X32:       # BB#0:
+; X32-NEXT:    comisd %xmm1, %xmm0
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comigt_sd:
+; X64:       # BB#0:
+; X64-NEXT:    comisd %xmm1, %xmm0
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comile_sd:
+; X32:       # BB#0:
+; X32-NEXT:    comisd %xmm0, %xmm1
+; X32-NEXT:    setae %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comile_sd:
+; X64:       # BB#0:
+; X64-NEXT:    comisd %xmm0, %xmm1
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comilt_sd:
+; X32:       # BB#0:
+; X32-NEXT:    comisd %xmm0, %xmm1
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comilt_sd:
+; X64:       # BB#0:
+; X64-NEXT:    comisd %xmm0, %xmm1
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comineq_sd:
+; X32:       # BB#0:
+; X32-NEXT:    comisd %xmm1, %xmm0
+; X32-NEXT:    setp %al
+; X32-NEXT:    setne %cl
+; X32-NEXT:    orb %al, %cl
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_comineq_sd:
+; X64:       # BB#0:
+; X64-NEXT:    comisd %xmm1, %xmm0
+; X64-NEXT:    setp %al
+; X64-NEXT:    setne %cl
+; X64-NEXT:    orb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_cvtepi32_pd(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtepi32_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cvtdq2pd %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtepi32_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cvtdq2pd %xmm0, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %arg0)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+define <4 x float> @test_mm_cvtepi32_ps(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtepi32_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtepi32_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %arg0)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_cvtpd_epi32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtpd_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    cvtpd2dq %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtpd_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    cvtpd2dq %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_cvtpd_ps(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtpd_ps:
+; X32:       # BB#0:
+; X32-NEXT:    cvtpd2ps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtpd_ps:
+; X64:       # BB#0:
+; X64-NEXT:    cvtpd2ps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_cvtps_epi32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtps_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    cvtps2dq %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtps_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    cvtps2dq %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
+
+define <2 x double> @test_mm_cvtps_pd(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtps_pd:
+; X32:       # BB#0:
+; X32-NEXT:    cvtps2pd %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtps_pd:
+; X64:       # BB#0:
+; X64-NEXT:    cvtps2pd %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+define double @test_mm_cvtsd_f64(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtsd_f64:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-8, %esp
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    movlps %xmm0, (%esp)
+; X32-NEXT:    fldl (%esp)
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtsd_f64:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  %res = extractelement <2 x double> %a0, i32 0
+  ret double %res
+}
+
+define i32 @test_mm_cvtsd_si32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtsd_si32:
+; X32:       # BB#0:
+; X32-NEXT:    cvtsd2si %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtsd_si32:
+; X64:       # BB#0:
+; X64-NEXT:    cvtsd2si %xmm0, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+
+define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtsi128_si32:
+; X32:       # BB#0:
+; X32-NEXT:    movd %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtsi128_si32:
+; X64:       # BB#0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %res = extractelement <4 x i32> %arg0, i32 0
+  ret i32 %res
+}
+
+define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
+; X32-LABEL: test_mm_cvtsi32_sd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    cvtsi2sdl %eax, %xmm1
+; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtsi32_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cvtsi2sdl %edi, %xmm1
+; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT:    retq
+  %cvt = sitofp i32 %a1 to double
+  %res = insertelement <2 x double> %a0, double %cvt, i32 0
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
+; X32-LABEL: test_mm_cvtsi32_si128:
+; X32:       # BB#0:
+; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtsi32_si128:
+; X64:       # BB#0:
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    retq
+  %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
+  %res2 = insertelement <4 x i32> %res1, i32 0, i32 2
+  %res3 = insertelement <4 x i32> %res2, i32 0, i32 3
+  %res = bitcast <4 x i32> %res3 to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cvtss_sd:
+; X32:       # BB#0:
+; X32-NEXT:    cvtss2sd %xmm1, %xmm1
+; X32-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtss_sd:
+; X64:       # BB#0:
+; X64-NEXT:    cvtss2sd %xmm1, %xmm1
+; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT:    retq
+  %ext = extractelement <4 x float> %a1, i32 0
+  %cvt = fpext float %ext to double
+  %res = insertelement <2 x double> %a0, double %cvt, i32 0
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cvttpd_epi32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvttpd_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    cvttpd2dq %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvttpd_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    cvttpd2dq %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvttps_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    cvttps2dq %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvttps_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    cvttps2dq %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvttsd_si32:
+; X32:       # BB#0:
+; X32-NEXT:    cvttsd2si %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvttsd_si32:
+; X64:       # BB#0:
+; X64-NEXT:    cvttsd2si %xmm0, %eax
+; X64-NEXT:    retq
+  %ext = extractelement <2 x double> %a0, i32 0
+  %res = fptosi double %ext to i32
+  ret i32 %res
+}
+
+define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_div_pd:
+; X32:       # BB#0:
+; X32-NEXT:    divpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_div_pd:
+; X64:       # BB#0:
+; X64-NEXT:    divpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = fdiv <2 x double> %a0, %a1
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_div_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_div_sd:
+; X32:       # BB#0:
+; X32-NEXT:    divsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_div_sd:
+; X64:       # BB#0:
+; X64-NEXT:    divsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %ext0 = extractelement <2 x double> %a0, i32 0
+  %ext1 = extractelement <2 x double> %a1, i32 0
+  %fdiv = fdiv double %ext0, %ext1
+  %res = insertelement <2 x double> %a0, double %fdiv, i32 0
+  ret <2 x double> %res
+}
+
+define i32 @test_mm_extract_epi16(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_extract_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pextrw $1, %xmm0, %eax
+; X32-NEXT:    movzwl %ax, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_extract_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pextrw $1, %xmm0, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %ext = extractelement <8 x i16> %arg0, i32 1
+  %res = zext i16 %ext to i32
+  ret i32 %res
+}
+
+define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
+; X32-LABEL: test_mm_insert_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT:    pinsrw $1, %eax, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_insert_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pinsrw $1, %edi, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %res = insertelement <8 x i16> %arg0, i16 %a1,i32 1
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define void @test_mm_lfence() nounwind {
+; X32-LABEL: test_mm_lfence:
+; X32:       # BB#0:
+; X32-NEXT:    lfence
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_lfence:
+; X64:       # BB#0:
+; X64-NEXT:    lfence
+; X64-NEXT:    retq
+  call void @llvm.x86.sse2.lfence()
+  ret void
+}
+declare void @llvm.x86.sse2.lfence() nounwind readnone
+
+define <2 x double> @test_mm_load_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_load_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_load_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast double* %a0 to <2 x double>*
+  %res = load <2 x double>, <2 x double>* %arg0, align 16
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_load_sd(double* %a0) nounwind {
+; X32-LABEL: test_mm_load_sd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_load_sd:
+; X64:       # BB#0:
+; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    retq
+  %ld = load double, double* %a0, align 1
+  %res0 = insertelement <2 x double> undef, double %ld, i32 0
+  %res1 = insertelement <2 x double> %res0, double 0.0, i32 1
+  ret <2 x double> %res1
+}
+
+define <2 x i64> @test_mm_load_si128(<2 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm_load_si128:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_load_si128:
+; X64:       # BB#0:
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    retq
+  %res = load <2 x i64>, <2 x i64>* %a0, align 16
+  ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_load1_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_load1_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_load1_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT:    retq
+  %ld = load double, double* %a0, align 8
+  %res0 = insertelement <2 x double> undef, double %ld, i32 0
+  %res1 = insertelement <2 x double> %res0, double %ld, i32 1
+  ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_loadh_pd(<2 x double> %a0, double* %a1) nounwind {
+; X32-LABEL: test_mm_loadh_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadh_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X64-NEXT:    retq
+  %ld = load double, double* %a1, align 8
+  %res = insertelement <2 x double> %a0, double %ld, i32 1
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_loadl_epi64(<2 x i64> %a0, <2 x i64>* %a1) nounwind {
+; X32-LABEL: test_mm_loadl_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadl_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    retq
+  %bc = bitcast <2 x i64>* %a1 to i64*
+  %ld = load i64, i64* %bc, align 1
+  %res0 = insertelement <2 x i64> undef, i64 %ld, i32 0
+  %res1 = insertelement <2 x i64> %res0, i64 0, i32 1
+  ret <2 x i64> %res1
+}
+
+define <2 x double> @test_mm_loadl_pd(<2 x double> %a0, double* %a1) nounwind {
+; X32-LABEL: test_mm_loadl_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadl_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; X64-NEXT:    retq
+  %ld = load double, double* %a1, align 8
+  %res = insertelement <2 x double> %a0, double %ld, i32 0
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_loadr_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_loadr_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movapd (%eax), %xmm0
+; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadr_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movapd (%rdi), %xmm0
+; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT:    retq
+  %arg0 = bitcast double* %a0 to <2 x double>*
+  %ld = load <2 x double>, <2 x double>* %arg0, align 16
+  %res = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_loadu_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_loadu_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movups (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadu_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movups (%rdi), %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast double* %a0 to <2 x double>*
+  %res = load <2 x double>, <2 x double>* %arg0, align 1
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_loadu_si128(<2 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm_loadu_si128:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movups (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_loadu_si128:
+; X64:       # BB#0:
+; X64-NEXT:    movups (%rdi), %xmm0
+; X64-NEXT:    retq
+  %res = load <2 x i64>, <2 x i64>* %a0, align 1
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_madd_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_madd_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pmaddwd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_madd_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pmaddwd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
+
+define void @test_mm_maskmoveu_si128(<2 x i64> %a0, <2 x i64> %a1, i8* %a2) nounwind {
+; X32-LABEL: test_mm_maskmoveu_si128:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    maskmovdqu %xmm1, %xmm0
+; X32-NEXT:    popl %edi
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_maskmoveu_si128:
+; X64:       # BB#0:
+; X64-NEXT:    maskmovdqu %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %arg0, <16 x i8> %arg1, i8* %a2)
+  ret void
+}
+declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*) nounwind
+
+define <2 x i64> @test_mm_max_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_max_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pmaxsw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_max_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_max_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_max_epu8:
+; X32:       # BB#0:
+; X32-NEXT:    pmaxub %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_max_epu8:
+; X64:       # BB#0:
+; X64-NEXT:    pmaxub %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %arg0, <16 x i8> %arg1)
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x double> @test_mm_max_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_max_pd:
+; X32:       # BB#0:
+; X32-NEXT:    maxpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_max_pd:
+; X64:       # BB#0:
+; X64-NEXT:    maxpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_max_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_max_sd:
+; X32:       # BB#0:
+; X32-NEXT:    maxsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_max_sd:
+; X64:       # BB#0:
+; X64-NEXT:    maxsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define void @test_mm_mfence() nounwind {
+; X32-LABEL: test_mm_mfence:
+; X32:       # BB#0:
+; X32-NEXT:    mfence
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mfence:
+; X64:       # BB#0:
+; X64-NEXT:    mfence
+; X64-NEXT:    retq
+  call void @llvm.x86.sse2.mfence()
+  ret void
+}
+declare void @llvm.x86.sse2.mfence() nounwind readnone
+
+define <2 x i64> @test_mm_min_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_min_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pminsw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_min_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pminsw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_min_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_min_epu8:
+; X32:       # BB#0:
+; X32-NEXT:    pminub %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_min_epu8:
+; X64:       # BB#0:
+; X64-NEXT:    pminub %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %arg0, <16 x i8> %arg1)
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x double> @test_mm_min_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_min_pd:
+; X32:       # BB#0:
+; X32-NEXT:    minpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_min_pd:
+; X64:       # BB#0:
+; X64-NEXT:    minpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_min_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_min_sd:
+; X32:       # BB#0:
+; X32-NEXT:    minsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_min_sd:
+; X64:       # BB#0:
+; X64-NEXT:    minsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_movemask_epi8(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_movemask_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    pmovmskb %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_movemask_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    pmovmskb %xmm0, %eax
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %arg0)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
+
+define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_movemask_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movmskpd %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_movemask_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movmskpd %xmm0, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mul_epu32:
+; X32:       # BB#0:
+; X32-NEXT:    pmuludq %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mul_epu32:
+; X64:       # BB#0:
+; X64-NEXT:    pmuludq %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %arg0, <4 x i32> %arg1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_mul_pd:
+; X32:       # BB#0:
+; X32-NEXT:    mulpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mul_pd:
+; X64:       # BB#0:
+; X64-NEXT:    mulpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = fmul <2 x double> %a0, %a1
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_mul_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_mul_sd:
+; X32:       # BB#0:
+; X32-NEXT:    mulsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mul_sd:
+; X64:       # BB#0:
+; X64-NEXT:    mulsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %ext0 = extractelement <2 x double> %a0, i32 0
+  %ext1 = extractelement <2 x double> %a1, i32 0
+  %fmul = fmul double %ext0, %ext1
+  %res = insertelement <2 x double> %a0, double %fmul, i32 0
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_mulhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mulhi_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pmulhw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mulhi_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pmulhw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_mulhi_epu16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mulhi_epu16:
+; X32:       # BB#0:
+; X32-NEXT:    pmulhuw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mulhi_epu16:
+; X64:       # BB#0:
+; X64-NEXT:    pmulhuw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_mullo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mullo_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pmullw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mullo_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pmullw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = mul <8 x i16> %arg0, %arg1
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_or_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_or_pd:
+; X32:       # BB#0:
+; X32-NEXT:    orps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_or_pd:
+; X64:       # BB#0:
+; X64-NEXT:    orps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+  %res = or <4 x i32> %arg0, %arg1
+  %bc = bitcast <4 x i32> %res to <2 x double>
+  ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_or_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_or_si128:
+; X32:       # BB#0:
+; X32-NEXT:    orps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_or_si128:
+; X64:       # BB#0:
+; X64-NEXT:    orps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = or <2 x i64> %a0, %a1
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_packs_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packs_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    packsswb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_packs_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    packsswb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_packs_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packs_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    packssdw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_packs_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    packssdw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %arg0, <4 x i32> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_packus_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packus_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    packuswb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_packus_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    packuswb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define void @test_mm_pause() nounwind { ; _mm_pause: spin-wait loop hint (PAUSE)
+; X32-LABEL: test_mm_pause:
+; X32:       # BB#0:
+; X32-NEXT:    pause
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_pause:
+; X64:       # BB#0:
+; X64-NEXT:    pause
+; X64-NEXT:    retq
+  call void @llvm.x86.sse2.pause()
+  ret void
+}
+declare void @llvm.x86.sse2.pause() nounwind readnone
+
+define <2 x i64> @test_mm_sad_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; _mm_sad_epu8: sum of absolute differences of u8 lanes (PSADBW)
+; X32-LABEL: test_mm_sad_epu8:
+; X32:       # BB#0:
+; X32-NEXT:    psadbw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sad_epu8:
+; X64:       # BB#0:
+; X64-NEXT:    psadbw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %arg0, <16 x i8> %arg1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x double> @test_mm_setzero_pd() { ; _mm_setzero_pd: all-zero vector (expect a single xorps)
+; X32-LABEL: test_mm_setzero_pd:
+; X32:       # BB#0:
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_setzero_pd:
+; X64:       # BB#0:
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    retq
+  ret <2 x double> zeroinitializer
+}
+
+define <2 x i64> @test_mm_setzero_si128() { ; _mm_setzero_si128: all-zero integer vector
+; X32-LABEL: test_mm_setzero_si128:
+; X32:       # BB#0:
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_setzero_si128:
+; X64:       # BB#0:
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    retq
+  ret <2 x i64> zeroinitializer
+}
+
+define <2 x i64> @test_mm_shuffle_epi32(<2 x i64> %a0) { ; _mm_shuffle_epi32 with imm 0: splat element 0 (PSHUFD)
+; X32-LABEL: test_mm_shuffle_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_shuffle_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %res = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask = broadcast lane 0
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) { ; _mm_shuffle_pd: result = { a0[1], a1[0] } (SHUFPD)
+; X32-LABEL: test_mm_shuffle_pd:
+; X32:       # BB#0:
+; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_shuffle_pd:
+; X64:       # BB#0:
+; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; X64-NEXT:    retq
+  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_shufflehi_epi16(<2 x i64> %a0) { ; _mm_shufflehi_epi16: splat lane 4 across the high 4 i16 lanes (PSHUFHW)
+; X32-LABEL: test_mm_shufflehi_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_shufflehi_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_shufflelo_epi16(<2 x i64> %a0) { ; _mm_shufflelo_epi16: splat lane 0 across the low 4 i16 lanes (PSHUFLW)
+; X32-LABEL: test_mm_shufflelo_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_shufflelo_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sll_epi16(<2 x i64> %a0, <2 x i64> %a1) { ; _mm_sll_epi16: logical shift left i16 lanes by count in %a1 (PSLLW)
+; X32-LABEL: test_mm_sll_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    psllw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sll_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    psllw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_sll_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; _mm_sll_epi32: logical shift left i32 lanes by count in %a1 (PSLLD)
+; X32-LABEL: test_mm_sll_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    pslld %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sll_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    pslld %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %arg0, <4 x i32> %arg1)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_sll_epi64(<2 x i64> %a0, <2 x i64> %a1) { ; _mm_sll_epi64: logical shift left i64 lanes by count in %a1 (PSLLQ); no bitcasts needed
+; X32-LABEL: test_mm_sll_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    psllq %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sll_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    psllq %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) { ; _mm_slli_epi16: shift left i16 lanes by immediate 1 (PSLLW imm)
+; X32-LABEL: test_mm_slli_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    psllw $1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_slli_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    psllw $1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) { ; _mm_slli_epi32: shift left i32 lanes by immediate 1 (PSLLD imm)
+; X32-LABEL: test_mm_slli_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    pslld $1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_slli_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    pslld $1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) { ; _mm_slli_epi64: shift left i64 lanes by immediate 1 (PSLLQ imm)
+; X32-LABEL: test_mm_slli_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    psllq $1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_slli_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    psllq $1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_slli_si128(<2 x i64> %a0) nounwind { ; _mm_slli_si128: whole-register byte shift left by 5 (PSLLDQ), expressed as a shuffle
+; X32-LABEL: test_mm_slli_si128:
+; X32:       # BB#0:
+; X32-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_slli_si128:
+; X64:       # BB#0:
+; X64-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26> ; 5 zero bytes then a0[0..10]
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_sqrt_pd(<2 x double> %a0) nounwind { ; _mm_sqrt_pd: packed double sqrt (SQRTPD)
+; X32-LABEL: test_mm_sqrt_pd:
+; X32:       # BB#0:
+; X32-NEXT:    sqrtpd %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sqrt_pd:
+; X64:       # BB#0:
+; X64-NEXT:    sqrtpd %xmm0, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; scalar sqrt: result = { sqrt(a0[0]), a1[1] } (matches clang's _mm_sqrt_sd expansion)
+; X32-LABEL: test_mm_sqrt_sd:
+; X32:       # BB#0:
+; X32-NEXT:    sqrtsd %xmm0, %xmm1
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sqrt_sd:
+; X64:       # BB#0:
+; X64-NEXT:    sqrtsd %xmm0, %xmm1
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %call = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
+  %ext0 = extractelement <2 x double> %call, i32 0 ; low lane: sqrt(a0[0])
+  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+  %ext1 = extractelement <2 x double> %a1, i32 1 ; high lane taken from %a1
+  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+  ret <2 x double> %ins1
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_sra_epi16(<2 x i64> %a0, <2 x i64> %a1) { ; _mm_sra_epi16: arithmetic shift right i16 lanes by count in %a1 (PSRAW)
+; X32-LABEL: test_mm_sra_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    psraw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sra_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    psraw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_sra_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; _mm_sra_epi32: arithmetic shift right i32 lanes by count in %a1 (PSRAD)
+; X32-LABEL: test_mm_sra_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    psrad %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sra_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    psrad %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %arg0, <4 x i32> %arg1)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_srai_epi16(<2 x i64> %a0) { ; _mm_srai_epi16: arithmetic shift right i16 lanes by immediate 1 (PSRAW imm)
+; X32-LABEL: test_mm_srai_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    psraw $1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_srai_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    psraw $1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %arg0, i32 1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srai_epi32(<2 x i64> %a0) { ; _mm_srai_epi32: arithmetic shift right i32 lanes by immediate 1 (PSRAD imm)
+; X32-LABEL: test_mm_srai_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    psrad $1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_srai_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    psrad $1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %arg0, i32 1)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srl_epi16(<2 x i64> %a0, <2 x i64> %a1) { ; _mm_srl_epi16: logical shift right i16 lanes by count in %a1 (PSRLW)
+; X32-LABEL: test_mm_srl_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    psrlw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_srl_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    psrlw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_srl_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; _mm_srl_epi32: logical shift right i32 lanes by count in %a1 (PSRLD)
+; X32-LABEL: test_mm_srl_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    psrld %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_srl_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    psrld %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %arg0, <4 x i32> %arg1)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_srl_epi64(<2 x i64> %a0, <2 x i64> %a1) { ; _mm_srl_epi64: logical shift right i64 lanes by count in %a1 (PSRLQ); no bitcasts needed
+; X32-LABEL: test_mm_srl_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    psrlq %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_srl_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    psrlq %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_srli_epi16(<2 x i64> %a0) { ; _mm_srli_epi16: logical shift right i16 lanes by immediate 1 (PSRLW imm)
+; X32-LABEL: test_mm_srli_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    psrlw $1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_srli_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    psrlw $1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %arg0, i32 1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srli_epi32(<2 x i64> %a0) { ; _mm_srli_epi32: logical shift right i32 lanes by immediate 1 (PSRLD imm)
+; X32-LABEL: test_mm_srli_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    psrld $1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_srli_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    psrld $1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %arg0, i32 1)
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srli_epi64(<2 x i64> %a0) { ; _mm_srli_epi64: logical shift right i64 lanes by immediate 1 (PSRLQ imm)
+; X32-LABEL: test_mm_srli_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    psrlq $1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_srli_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    psrlq $1, %xmm0
+; X64-NEXT:    retq
+  %res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srli_si128(<2 x i64> %a0) nounwind { ; _mm_srli_si128: whole-register byte shift right by 5 (PSRLDQ), expressed as a shuffle
+; X32-LABEL: test_mm_srli_si128:
+; X32:       # BB#0:
+; X32-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_srli_si128:
+; X64:       # BB#0:
+; X64-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20> ; a0[5..15] then 5 zero bytes
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define void @test_mm_store_pd(double *%a0, <2 x double> %a1) { ; _mm_store_pd: 16-byte-aligned vector store
+; X32-LABEL: test_mm_store_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_store_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast double* %a0 to <2 x double>*
+  store <2 x double> %a1, <2 x double>* %arg0, align 16
+  ret void
+}
+
+define void @test_mm_store_sd(double *%a0, <2 x double> %a1) { ; _mm_store_sd: store the low double only
+; X32-LABEL: test_mm_store_sd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movsd %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_store_sd:
+; X64:       # BB#0:
+; X64-NEXT:    movsd %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %ext = extractelement <2 x double> %a1, i32 0
+  store double %ext, double* %a0, align 1
+  ret void
+}
+
+define void @test_mm_store_si128(<2 x i64> *%a0, <2 x i64> %a1) { ; _mm_store_si128: 16-byte-aligned integer vector store
+; X32-LABEL: test_mm_store_si128:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_store_si128:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  store <2 x i64> %a1, <2 x i64>* %a0, align 16
+  ret void
+}
+
+define void @test_mm_store1_sd(double *%a0, <2 x double> %a1) { ; store1: duplicate the low double into both slots of the destination
+; X32-LABEL: test_mm_store1_sd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movsd %xmm0, (%eax)
+; X32-NEXT:    movsd %xmm0, 8(%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_store1_sd:
+; X64:       # BB#0:
+; X64-NEXT:    movsd %xmm0, (%rdi)
+; X64-NEXT:    movsd %xmm0, 8(%rdi)
+; X64-NEXT:    retq
+  %ext = extractelement <2 x double> %a1, i32 0
+  %ptr0 = getelementptr inbounds double, double* %a0, i32 0
+  %ptr1 = getelementptr inbounds double, double* %a0, i32 1
+  store double %ext, double* %ptr0, align 1
+  store double %ext, double* %ptr1, align 1
+  ret void
+}
+
+define void @test_mm_storeh_sd(double *%a0, <2 x double> %a1) { ; _mm_storeh_pd-style: store the high double (element 1)
+; X32-LABEL: test_mm_storeh_sd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT:    movsd %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storeh_sd:
+; X64:       # BB#0:
+; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT:    movsd %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %ext = extractelement <2 x double> %a1, i32 1
+  store double %ext, double* %a0, align 8
+  ret void
+}
+
+define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) { ; _mm_storel_epi64: store the low i64 lane
+; X32-LABEL: test_mm_storel_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movlps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storel_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    movd %xmm0, %rax
+; X64-NEXT:    movq %rax, (%rdi)
+; X64-NEXT:    retq
+  %ext = extractelement <2 x i64> %a1, i32 0
+  %bc = bitcast <2 x i64> *%a0 to i64*
+  store i64 %ext, i64* %bc, align 8
+  ret void
+}
+
+define void @test_mm_storel_sd(double *%a0, <2 x double> %a1) { ; _mm_storel_pd-style: store the low double (element 0)
+; X32-LABEL: test_mm_storel_sd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movsd %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storel_sd:
+; X64:       # BB#0:
+; X64-NEXT:    movsd %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %ext = extractelement <2 x double> %a1, i32 0
+  store double %ext, double* %a0, align 8
+  ret void
+}
+
+define void @test_mm_storer_pd(double *%a0, <2 x double> %a1) { ; _mm_storer_pd: reverse the two doubles, then aligned store
+; X32-LABEL: test_mm_storer_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT:    movapd %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storer_pd:
+; X64:       # BB#0:
+; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT:    movapd %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast double* %a0 to <2 x double>*
+  %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  store <2 x double> %shuf, <2 x double>* %arg0, align 16
+  ret void
+}
+
+define void @test_mm_storeu_pd(double *%a0, <2 x double> %a1) { ; _mm_storeu_pd: unaligned store (align 1 -> movups)
+; X32-LABEL: test_mm_storeu_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movups %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storeu_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movups %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast double* %a0 to <2 x double>*
+  store <2 x double> %a1, <2 x double>* %arg0, align 1
+  ret void
+}
+
+define void @test_mm_storeu_si128(<2 x i64> *%a0, <2 x i64> %a1) { ; _mm_storeu_si128: unaligned integer vector store
+; X32-LABEL: test_mm_storeu_si128:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movups %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_storeu_si128:
+; X64:       # BB#0:
+; X64-NEXT:    movups %xmm0, (%rdi)
+; X64-NEXT:    retq
+  store <2 x i64> %a1, <2 x i64>* %a0, align 1
+  ret void
+}
+
+define void @test_mm_stream_pd(double *%a0, <2 x double> %a1) { ; _mm_stream_pd: non-temporal store via !nontemporal metadata (MOVNTPS)
+; X32-LABEL: test_mm_stream_pd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movntps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_stream_pd:
+; X64:       # BB#0:
+; X64-NEXT:    movntps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %arg0 = bitcast double* %a0 to <2 x double>*
+  store <2 x double> %a1, <2 x double>* %arg0, align 16, !nontemporal !0
+  ret void
+}
+
+define void @test_mm_stream_si32(i32 *%a0, i32 %a1) { ; _mm_stream_si32: non-temporal scalar i32 store (MOVNTI)
+; X32-LABEL: test_mm_stream_si32:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movntil %eax, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_stream_si32:
+; X64:       # BB#0:
+; X64-NEXT:    movntil %esi, (%rdi)
+; X64-NEXT:    retq
+  store i32 %a1, i32* %a0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_mm_stream_si128(<2 x i64> *%a0, <2 x i64> %a1) { ; _mm_stream_si128: non-temporal integer vector store
+; X32-LABEL: test_mm_stream_si128:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movntps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_stream_si128:
+; X64:       # BB#0:
+; X64-NEXT:    movntps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  store <2 x i64> %a1, <2 x i64>* %a0, align 16, !nontemporal !0
+  ret void
+}
+
+define <2 x i64> @test_mm_sub_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; _mm_sub_epi8: lane-wise i8 subtract (PSUBB)
+; X32-LABEL: test_mm_sub_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    psubb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sub_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    psubb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = sub <16 x i8> %arg0, %arg1
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sub_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; _mm_sub_epi16: lane-wise i16 subtract (PSUBW)
+; X32-LABEL: test_mm_sub_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    psubw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sub_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    psubw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = sub <8 x i16> %arg0, %arg1
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sub_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; _mm_sub_epi32: lane-wise i32 subtract (PSUBD)
+; X32-LABEL: test_mm_sub_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    psubd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sub_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    psubd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %res = sub <4 x i32> %arg0, %arg1
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sub_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; _mm_sub_epi64: lane-wise i64 subtract (PSUBQ)
+; X32-LABEL: test_mm_sub_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    psubq %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sub_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    psubq %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = sub <2 x i64> %a0, %a1
+  ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_sub_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; _mm_sub_pd: packed double subtract (SUBPD)
+; X32-LABEL: test_mm_sub_pd:
+; X32:       # BB#0:
+; X32-NEXT:    subpd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sub_pd:
+; X64:       # BB#0:
+; X64-NEXT:    subpd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = fsub <2 x double> %a0, %a1
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_sub_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; _mm_sub_sd: scalar subtract in the low lane, high lane from %a0 (SUBSD)
+; X32-LABEL: test_mm_sub_sd:
+; X32:       # BB#0:
+; X32-NEXT:    subsd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_sub_sd:
+; X64:       # BB#0:
+; X64-NEXT:    subsd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %ext0 = extractelement <2 x double> %a0, i32 0
+  %ext1 = extractelement <2 x double> %a1, i32 0
+  %fsub = fsub double %ext0, %ext1
+  %res = insertelement <2 x double> %a0, double %fsub, i32 0
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_subs_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; _mm_subs_epi8: signed saturating i8 subtract (PSUBSB)
+; X32-LABEL: test_mm_subs_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    psubsb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_subs_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    psubsb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1)
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; _mm_subs_epi16: signed saturating i16 subtract (PSUBSW)
+; X32-LABEL: test_mm_subs_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    psubsw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_subs_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    psubsw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; _mm_subs_epu8: unsigned saturating u8 subtract (PSUBUSB)
+; X32-LABEL: test_mm_subs_epu8:
+; X32:       # BB#0:
+; X32-NEXT:    psubusb %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_subs_epu8:
+; X64:       # BB#0:
+; X64-NEXT:    psubusb %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %arg0, <16 x i8> %arg1)
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; _mm_subs_epu16: unsigned saturating u16 subtract (PSUBUSW)
+; X32-LABEL: test_mm_subs_epu16:
+; X32:       # BB#0:
+; X32-NEXT:    psubusw %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_subs_epu16:
+; X64:       # BB#0:
+; X64-NEXT:    psubusw %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1)
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; _mm_ucomieq_sd: unordered scalar compare for equality (UCOMISD)
+; X32-LABEL: test_mm_ucomieq_sd:
+; X32:       # BB#0:
+; X32-NEXT:    ucomisd %xmm1, %xmm0
+; X32-NEXT:    setnp %al
+; X32-NEXT:    sete %cl
+; X32-NEXT:    andb %al, %cl
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomieq_sd:
+; X64:       # BB#0:
+; X64-NEXT:    ucomisd %xmm1, %xmm0
+; X64-NEXT:    setnp %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; lowering uses sete AND setnp so a NaN (PF=1) yields 0
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomige_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; _mm_ucomige_sd: unordered scalar compare, greater-or-equal
+; X32-LABEL: test_mm_ucomige_sd:
+; X32:       # BB#0:
+; X32-NEXT:    ucomisd %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomige_sd:
+; X64:       # BB#0:
+; X64-NEXT:    ucomisd %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; _mm_ucomigt_sd: unordered scalar compare, greater-than
+; X32-LABEL: test_mm_ucomigt_sd:
+; X32:       # BB#0:
+; X32-NEXT:    ucomisd %xmm1, %xmm0
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomigt_sd:
+; X64:       # BB#0:
+; X64-NEXT:    ucomisd %xmm1, %xmm0
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomile_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; _mm_ucomile_sd: less-or-equal; operands swapped so setae can be reused
+; X32-LABEL: test_mm_ucomile_sd:
+; X32:       # BB#0:
+; X32-NEXT:    ucomisd %xmm0, %xmm1
+; X32-NEXT:    setae %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomile_sd:
+; X64:       # BB#0:
+; X64-NEXT:    ucomisd %xmm0, %xmm1
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; _mm_ucomilt_sd: less-than; operands swapped so seta can be reused
+; X32-LABEL: test_mm_ucomilt_sd:
+; X32:       # BB#0:
+; X32-NEXT:    ucomisd %xmm0, %xmm1
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomilt_sd:
+; X64:       # BB#0:
+; X64-NEXT:    ucomisd %xmm0, %xmm1
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; _mm_ucomineq_sd: unordered scalar compare for inequality
+; X32-LABEL: test_mm_ucomineq_sd:
+; X32:       # BB#0:
+; X32-NEXT:    ucomisd %xmm1, %xmm0
+; X32-NEXT:    setp %al
+; X32-NEXT:    setne %cl
+; X32-NEXT:    orb %al, %cl
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_ucomineq_sd:
+; X64:       # BB#0:
+; X64-NEXT:    ucomisd %xmm1, %xmm0
+; X64-NEXT:    setp %al
+; X64-NEXT:    setne %cl
+; X64-NEXT:    orb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    retq
+  %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; lowering uses setne OR setp so a NaN (PF=1) yields 1
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_undefined_pd() { ; _mm_undefined_pd: undef value, no instructions expected
+; X32-LABEL: test_mm_undefined_pd:
+; X32:       # BB#0:
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_undefined_pd:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  ret <2 x double> undef
+}
+
+define <2 x i64> @test_mm_undefined_si128() { ; _mm_undefined_si128: undef value, no instructions expected
+; X32-LABEL: test_mm_undefined_si128:
+; X32:       # BB#0:
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_undefined_si128:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  ret <2 x i64> undef
+}
+
+define <2 x i64> @test_mm_unpackhi_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpackhi_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X64-NEXT:    retq
+; _mm_unpackhi_epi8: the shuffle mask <8,24,9,25,...,15,31> interleaves the
+; high 8 bytes of %a0 with the high 8 bytes of %a1, matching punpckhbw.
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpackhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpackhi_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT:    retq
+; _mm_unpackhi_epi16: mask <4,12,5,13,6,14,7,15> interleaves the high 4 words
+; of each operand, matching punpckhwd.
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpackhi_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpackhi_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    retq
+; _mm_unpackhi_epi32: mask <2,6,3,7> interleaves the high 2 dwords of each
+; operand, matching punpckhdq.
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %res = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpackhi_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpackhi_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT:    retq
+; _mm_unpackhi_epi64: mask <1,3> pairs the high qword of each operand,
+; matching punpckhqdq. No bitcasts needed; the element type is already i64.
+  %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_unpackhi_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_unpackhi_pd:
+; X32:       # BB#0:
+; X32-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpackhi_pd:
+; X64:       # BB#0:
+; X64-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT:    retq
+; _mm_unpackhi_pd: mask <1,3> pairs the high double of each operand,
+; matching unpckhpd (FP-domain counterpart of punpckhqdq).
+  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_unpacklo_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi8:
+; X32:       # BB#0:
+; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpacklo_epi8:
+; X64:       # BB#0:
+; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT:    retq
+; _mm_unpacklo_epi8: mask <0,16,1,17,...,7,23> interleaves the low 8 bytes of
+; %a0 with the low 8 bytes of %a1, matching punpcklbw.
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+  %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  %bc = bitcast <16 x i8> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpacklo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi16:
+; X32:       # BB#0:
+; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpacklo_epi16:
+; X64:       # BB#0:
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    retq
+; _mm_unpacklo_epi16: mask <0,8,1,9,2,10,3,11> interleaves the low 4 words of
+; each operand, matching punpcklwd.
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+  %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %bc = bitcast <8 x i16> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpacklo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi32:
+; X32:       # BB#0:
+; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpacklo_epi32:
+; X64:       # BB#0:
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    retq
+; _mm_unpacklo_epi32: mask <0,4,1,5> interleaves the low 2 dwords of each
+; operand, matching punpckldq.
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %res = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %bc = bitcast <4 x i32> %res to <2 x i64>
+  ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpacklo_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi64:
+; X32:       # BB#0:
+; X32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpacklo_epi64:
+; X64:       # BB#0:
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    retq
+; _mm_unpacklo_epi64: mask <0,2> pairs the low qword of each operand,
+; matching punpcklqdq. No bitcasts needed; the element type is already i64.
+  %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_unpacklo_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_unpacklo_pd:
+; X32:       # BB#0:
+; X32-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_unpacklo_pd:
+; X64:       # BB#0:
+; X64-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    retq
+; _mm_unpacklo_pd: mask <0,2> pairs the low double of each operand,
+; matching unpcklpd (FP-domain counterpart of punpcklqdq).
+  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_xor_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_xor_pd:
+; X32:       # BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_xor_pd:
+; X64:       # BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm0
+; X64-NEXT:    retq
+; _mm_xor_pd: bitwise xor of the double operands expressed as an integer xor
+; on the <4 x i32> bitcasts; the backend selects xorps for it here.
+  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+  %res = xor <4 x i32> %arg0, %arg1
+  %bc = bitcast <4 x i32> %res to <2 x double>
+  ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_xor_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_xor_si128:
+; X32:       # BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_xor_si128:
+; X64:       # BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm0
+; X64-NEXT:    retq
+; _mm_xor_si128: plain i64 vector xor; the backend selects xorps for it here.
+  %res = xor <2 x i64> %a0, %a1
+  ret <2 x i64> %res
+}
+
+; NOTE(review): bare metadata node holding i32 1 — presumably attached as
+; !nontemporal by load/store tests earlier in this file; confirm against the
+; full diff, and remove if nothing references !0.
+!0 = !{i32 1}
+




More information about the llvm-commits mailing list