[llvm] r304688 - [X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 4 13:12:05 PDT 2017
Author: rksimon
Date: Sun Jun 4 15:12:04 2017
New Revision: 304688
URL: http://llvm.org/viewvc/llvm-project?rev=304688&view=rev
Log:
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/build-vector-128.ll
llvm/trunk/test/CodeGen/X86/buildvec-insertvec.ll
llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
llvm/trunk/test/CodeGen/X86/haddsub-2.ll
llvm/trunk/test/CodeGen/X86/haddsub-undef.ll
llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll
llvm/trunk/test/CodeGen/X86/select.ll
llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/sse1.ll
llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/sse3-avx-addsub-2.ll
llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll
llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
llvm/trunk/test/CodeGen/X86/vec_set.ll
llvm/trunk/test/CodeGen/X86/vector-rem.ll
llvm/trunk/test/CodeGen/X86/vector-sext.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll
llvm/trunk/test/CodeGen/X86/vshift-1.ll
llvm/trunk/test/CodeGen/X86/vshift-2.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Jun 4 15:12:04 2017
@@ -7825,24 +7825,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDV
}
// Next, we iteratively mix elements, e.g. for v4f32:
- // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
- // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
- // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
- unsigned EltStride = NumElems >> 1;
- while (EltStride != 0) {
- for (unsigned i = 0; i < EltStride; ++i) {
- // If Ops[i+EltStride] is undef and this is the first round of mixing,
- // then it is safe to just drop this shuffle: V[i] is already in the
- // right place, the one element (since it's the first round) being
- // inserted as undef can be dropped. This isn't safe for successive
- // rounds because they will permute elements within both vectors.
- if (Ops[i+EltStride].isUndef() &&
- EltStride == NumElems/2)
- continue;
+ // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
+ // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
+ // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
+ for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
+ // Generate scaled UNPCKL shuffle mask.
+ SmallVector<int, 16> Mask;
+ for(unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(i);
+ for (unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(NumElems+i);
+ Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
- Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
- }
- EltStride >>= 1;
+ for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
+ Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
}
return Ops[0];
}
Modified: llvm/trunk/test/CodeGen/X86/build-vector-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/build-vector-128.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/build-vector-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/build-vector-128.ll Sun Jun 4 15:12:04 2017
@@ -41,9 +41,9 @@ define <4 x float> @test_buildvector_v4f
;
; SSE2-64-LABEL: test_buildvector_v4f32:
; SSE2-64: # BB#0:
-; SSE2-64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-64-NEXT: retq
;
; SSE41-64-LABEL: test_buildvector_v4f32:
@@ -74,13 +74,9 @@ define <4 x float> @test_buildvector_v4f
define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {
; SSE2-32-LABEL: test_buildvector_v2i64:
; SSE2-32: # BB#0:
-; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-32-NEXT: retl
;
; SSE-64-LABEL: test_buildvector_v2i64:
@@ -126,12 +122,12 @@ define <4 x i32> @test_buildvector_v4i32
; SSE2-64-LABEL: test_buildvector_v4i32:
; SSE2-64: # BB#0:
; SSE2-64-NEXT: movd %ecx, %xmm0
-; SSE2-64-NEXT: movd %esi, %xmm1
+; SSE2-64-NEXT: movd %edx, %xmm1
; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-64-NEXT: movd %edx, %xmm2
+; SSE2-64-NEXT: movd %esi, %xmm2
; SSE2-64-NEXT: movd %edi, %xmm0
; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-64-NEXT: retq
;
; SSE41-64-LABEL: test_buildvector_v4i32:
@@ -170,34 +166,34 @@ define <8 x i16> @test_buildvector_v8i16
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-32-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-32-NEXT: retl
;
; SSE2-64-LABEL: test_buildvector_v8i16:
; SSE2-64: # BB#0:
-; SSE2-64-NEXT: movd %ecx, %xmm0
+; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-64-NEXT: movd %r9d, %xmm1
-; SSE2-64-NEXT: movd %esi, %xmm2
-; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-64-NEXT: movd %r9d, %xmm0
+; SSE2-64-NEXT: movd %r8d, %xmm2
; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-64-NEXT: movd %ecx, %xmm0
; SSE2-64-NEXT: movd %edx, %xmm1
-; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-64-NEXT: movd %r8d, %xmm3
+; SSE2-64-NEXT: movd %esi, %xmm3
; SSE2-64-NEXT: movd %edi, %xmm0
; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-64-NEXT: retq
;
; SSE41-32-LABEL: test_buildvector_v8i16:
@@ -267,31 +263,31 @@ define <16 x i8> @test_buildvector_v16i8
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-32-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-32-NEXT: retl
;
; SSE2-64-LABEL: test_buildvector_v16i8:
@@ -299,34 +295,34 @@ define <16 x i8> @test_buildvector_v16i8
; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-64-NEXT: movd %ecx, %xmm0
-; SSE2-64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-64-NEXT: movd %r9d, %xmm1
+; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-64-NEXT: movd %esi, %xmm2
-; SSE2-64-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-64-NEXT: movd %edx, %xmm3
; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-64-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-64-NEXT: movd %r8d, %xmm1
+; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-64-NEXT: movd %r9d, %xmm0
+; SSE2-64-NEXT: movd %r8d, %xmm2
+; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-64-NEXT: movd %ecx, %xmm0
+; SSE2-64-NEXT: movd %edx, %xmm1
; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-64-NEXT: movd %esi, %xmm4
; SSE2-64-NEXT: movd %edi, %xmm0
-; SSE2-64-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-64-NEXT: retq
;
; SSE41-32-LABEL: test_buildvector_v16i8:
Modified: llvm/trunk/test/CodeGen/X86/buildvec-insertvec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/buildvec-insertvec.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/buildvec-insertvec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/buildvec-insertvec.ll Sun Jun 4 15:12:04 2017
@@ -75,9 +75,9 @@ entry:
define <4 x float> @test_buildvector_v4f32_register(float %f0, float %f1, float %f2, float %f3) {
; SSE2-LABEL: test_buildvector_v4f32_register:
; SSE2: # BB#0:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_register:
@@ -102,7 +102,7 @@ define <4 x float> @test_buildvector_v4f
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_load:
@@ -126,10 +126,10 @@ define <4 x float> @test_buildvector_v4f
define <4 x float> @test_buildvector_v4f32_partial_load(float %f0, float %f1, float %f2, float* %p3) {
; SSE2-LABEL: test_buildvector_v4f32_partial_load:
; SSE2: # BB#0:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_partial_load:
@@ -150,12 +150,12 @@ define <4 x i32> @test_buildvector_v4i32
; SSE2-LABEL: test_buildvector_v4i32_register:
; SSE2: # BB#0:
; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movd %esi, %xmm1
+; SSE2-NEXT: movd %edx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movd %edx, %xmm2
+; SSE2-NEXT: movd %esi, %xmm2
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4i32_register:
@@ -178,7 +178,7 @@ define <4 x i32> @test_buildvector_v4i32
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: movd %esi, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4i32_partial:
@@ -228,21 +228,21 @@ define <4 x i32> @test_buildvector_v4i32
define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) {
; SSE2-LABEL: test_buildvector_v8i16_register:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: movd %r9d, %xmm1
-; SSE2-NEXT: movd %esi, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd %r9d, %xmm0
+; SSE2-NEXT: movd %r8d, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movd %edx, %xmm1
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %r8d, %xmm3
+; SSE2-NEXT: movd %esi, %xmm3
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v8i16_register:
@@ -333,34 +333,34 @@ define <16 x i8> @test_buildvector_v16i8
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: movd %r9d, %xmm1
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movd %esi, %xmm2
-; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movd %edx, %xmm3
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT: movd %r8d, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movd %r9d, %xmm0
+; SSE2-NEXT: movd %r8d, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: movd %edx, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movd %esi, %xmm4
; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v16i8_register:
Modified: llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll (original)
+++ llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll Sun Jun 4 15:12:04 2017
@@ -159,27 +159,18 @@ define <8 x i32> @_clearupper8xi32a(<8 x
define <8 x i16> @_clearupper8xi16a(<8 x i16>) nounwind {
; SSE-LABEL: _clearupper8xi16a:
; SSE: # BB#0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: pextrw $2, %xmm0, %r9d
-; SSE-NEXT: pextrw $3, %xmm0, %edx
-; SSE-NEXT: pextrw $4, %xmm0, %r8d
-; SSE-NEXT: pextrw $5, %xmm0, %edi
-; SSE-NEXT: pextrw $6, %xmm0, %esi
-; SSE-NEXT: pextrw $7, %xmm0, %ecx
-; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm0, %eax
+; SSE-NEXT: pextrw $5, %xmm0, %ecx
+; SSE-NEXT: pextrw $6, %xmm0, %edx
+; SSE-NEXT: pextrw $7, %xmm0, %esi
+; SSE-NEXT: movd %esi, %xmm1
; SSE-NEXT: movd %edx, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT: movd %edi, %xmm1
+; SSE-NEXT: movd %ecx, %xmm1
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT: movd %esi, %xmm1
-; SSE-NEXT: movd %r9d, %xmm2
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT: movd %r8d, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
@@ -225,61 +216,33 @@ define <8 x i16> @_clearupper8xi16a(<8 x
define <16 x i16> @_clearupper16xi16a(<16 x i16>) nounwind {
; SSE-LABEL: _clearupper16xi16a:
; SSE: # BB#0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pextrw $1, %xmm0, %edi
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: pextrw $3, %xmm0, %ecx
-; SSE-NEXT: pextrw $4, %xmm0, %edx
-; SSE-NEXT: pextrw $5, %xmm0, %esi
-; SSE-NEXT: pextrw $6, %xmm0, %ebx
-; SSE-NEXT: pextrw $7, %xmm0, %ebp
-; SSE-NEXT: pextrw $1, %xmm1, %r10d
-; SSE-NEXT: pextrw $2, %xmm1, %r9d
-; SSE-NEXT: pextrw $3, %xmm1, %r14d
+; SSE-NEXT: pextrw $4, %xmm0, %eax
+; SSE-NEXT: pextrw $5, %xmm0, %ecx
+; SSE-NEXT: pextrw $6, %xmm0, %edx
+; SSE-NEXT: pextrw $7, %xmm0, %esi
; SSE-NEXT: pextrw $4, %xmm1, %r8d
-; SSE-NEXT: pextrw $5, %xmm1, %r15d
-; SSE-NEXT: pextrw $6, %xmm1, %r11d
-; SSE-NEXT: pextrw $7, %xmm1, %r12d
-; SSE-NEXT: movd %ebp, %xmm2
-; SSE-NEXT: movd %ecx, %xmm3
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT: pextrw $5, %xmm1, %r9d
+; SSE-NEXT: pextrw $6, %xmm1, %r10d
+; SSE-NEXT: pextrw $7, %xmm1, %edi
; SSE-NEXT: movd %esi, %xmm2
-; SSE-NEXT: movd %edi, %xmm4
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT: movd %ebx, %xmm2
-; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: movd %edx, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT: movd %edx, %xmm2
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: movd %eax, %xmm4
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movd %r12d, %xmm3
-; SSE-NEXT: movd %r14d, %xmm4
+; SSE-NEXT: movd %edi, %xmm3
+; SSE-NEXT: movd %r10d, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT: movd %r15d, %xmm3
-; SSE-NEXT: movd %r10d, %xmm5
+; SSE-NEXT: movd %r9d, %xmm3
+; SSE-NEXT: movd %r8d, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE-NEXT: movd %r11d, %xmm3
-; SSE-NEXT: movd %r9d, %xmm4
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT: movd %r8d, %xmm3
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: _clearupper16xi16a:
@@ -364,10 +327,9 @@ define <16 x i8> @_clearupper16xi8a(<16
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -375,31 +337,32 @@ define <16 x i8> @_clearupper16xi8a(<16
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
@@ -486,10 +449,9 @@ define <32 x i8> @_clearupper32xi8a(<32
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -497,31 +459,32 @@ define <32 x i8> @_clearupper32xi8a(<32
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -531,10 +494,9 @@ define <32 x i8> @_clearupper32xi8a(<32
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm4
+; SSE-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -542,31 +504,32 @@ define <32 x i8> @_clearupper32xi8a(<32
; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm5
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: movd %eax, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/haddsub-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub-2.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub-2.ll Sun Jun 4 15:12:04 2017
@@ -142,12 +142,12 @@ define <4 x i32> @phadd_d_test1(<4 x i32
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %eax, %edi
; SSE3-NEXT: movd %edi, %xmm0
-; SSE3-NEXT: movd %edx, %xmm1
+; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movd %esi, %xmm2
+; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test1:
@@ -196,16 +196,16 @@ define <4 x i32> @phadd_d_test2(<4 x i32
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %esi, %xmm0
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: movd %xmm1, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: movd %esi, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %ecx, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: movd %xmm1, %ecx
-; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movd %ecx, %xmm1
; SSE3-NEXT: movd %edx, %xmm0
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test2:
@@ -258,12 +258,12 @@ define <4 x i32> @phsub_d_test1(<4 x i32
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: subl %edi, %esi
; SSE3-NEXT: movd %esi, %xmm0
-; SSE3-NEXT: movd %ecx, %xmm1
+; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movd %edx, %xmm2
+; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test1:
@@ -312,16 +312,16 @@ define <4 x i32> @phsub_d_test2(<4 x i32
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: movd %edx, %xmm0
+; SSE3-NEXT: movd %xmm1, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm1, %esi
+; SSE3-NEXT: subl %esi, %edx
+; SSE3-NEXT: movd %edx, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE3-NEXT: movd %xmm1, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE3-NEXT: movd %xmm0, %edx
-; SSE3-NEXT: subl %edx, %eax
-; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movd %ecx, %xmm0
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test2:
@@ -518,19 +518,19 @@ define <8 x i32> @avx2_vphadd_d_test(<8
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
; SSE3-NEXT: addl %edx, %r9d
-; SSE3-NEXT: movd %xmm1, %esi
+; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE3-NEXT: movd %xmm0, %r10d
-; SSE3-NEXT: addl %esi, %r10d
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %edx, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
-; SSE3-NEXT: addl %esi, %edi
+; SSE3-NEXT: addl %edx, %edi
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE3-NEXT: movd %xmm0, %r11d
-; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: movd %xmm0, %r10d
+; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
@@ -541,24 +541,24 @@ define <8 x i32> @avx2_vphadd_d_test(<8
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
-; SSE3-NEXT: movd %xmm0, %esi
-; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: addl %r11d, %eax
; SSE3-NEXT: movd %edi, %xmm0
-; SSE3-NEXT: movd %r9d, %xmm1
+; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movd %r10d, %xmm2
+; SSE3-NEXT: movd %r9d, %xmm2
; SSE3-NEXT: movd %r8d, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE3-NEXT: movd %esi, %xmm1
-; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE3-NEXT: movd %edx, %xmm3
-; SSE3-NEXT: movd %r11d, %xmm1
+; SSE3-NEXT: movd %ecx, %xmm3
+; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
@@ -658,83 +658,83 @@ define <16 x i16> @avx2_vphadd_w_test(<1
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm0, %eax
-; SSE3-NEXT: pextrw $3, %xmm0, %r11d
-; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pextrw $3, %xmm0, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $4, %xmm0, %eax
-; SSE3-NEXT: pextrw $5, %xmm0, %r10d
-; SSE3-NEXT: addl %eax, %r10d
+; SSE3-NEXT: pextrw $5, %xmm0, %r11d
+; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pextrw $6, %xmm0, %eax
-; SSE3-NEXT: pextrw $7, %xmm0, %r13d
-; SSE3-NEXT: addl %eax, %r13d
+; SSE3-NEXT: pextrw $7, %xmm0, %r15d
+; SSE3-NEXT: addl %eax, %r15d
; SSE3-NEXT: movd %xmm1, %eax
-; SSE3-NEXT: pextrw $1, %xmm1, %r14d
-; SSE3-NEXT: addl %eax, %r14d
+; SSE3-NEXT: pextrw $1, %xmm1, %r13d
+; SSE3-NEXT: addl %eax, %r13d
; SSE3-NEXT: pextrw $2, %xmm1, %eax
-; SSE3-NEXT: pextrw $3, %xmm1, %ebp
-; SSE3-NEXT: addl %eax, %ebp
-; SSE3-NEXT: pextrw $4, %xmm1, %eax
-; SSE3-NEXT: pextrw $5, %xmm1, %ebx
+; SSE3-NEXT: pextrw $3, %xmm1, %ebx
; SSE3-NEXT: addl %eax, %ebx
+; SSE3-NEXT: pextrw $4, %xmm1, %eax
+; SSE3-NEXT: pextrw $5, %xmm1, %r8d
+; SSE3-NEXT: addl %eax, %r8d
; SSE3-NEXT: pextrw $6, %xmm1, %eax
-; SSE3-NEXT: pextrw $7, %xmm1, %edx
-; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: pextrw $7, %xmm1, %esi
+; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %xmm2, %eax
-; SSE3-NEXT: pextrw $1, %xmm2, %ecx
-; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $1, %xmm2, %r10d
+; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pextrw $2, %xmm2, %eax
-; SSE3-NEXT: pextrw $3, %xmm2, %r12d
-; SSE3-NEXT: addl %eax, %r12d
+; SSE3-NEXT: pextrw $3, %xmm2, %r14d
+; SSE3-NEXT: addl %eax, %r14d
; SSE3-NEXT: pextrw $4, %xmm2, %eax
-; SSE3-NEXT: pextrw $5, %xmm2, %r15d
-; SSE3-NEXT: addl %eax, %r15d
+; SSE3-NEXT: pextrw $5, %xmm2, %r12d
+; SSE3-NEXT: addl %eax, %r12d
; SSE3-NEXT: pextrw $6, %xmm2, %eax
-; SSE3-NEXT: pextrw $7, %xmm2, %r8d
-; SSE3-NEXT: addl %eax, %r8d
-; SSE3-NEXT: movd %xmm3, %eax
-; SSE3-NEXT: pextrw $1, %xmm3, %r9d
+; SSE3-NEXT: pextrw $7, %xmm2, %r9d
; SSE3-NEXT: addl %eax, %r9d
-; SSE3-NEXT: pextrw $2, %xmm3, %eax
-; SSE3-NEXT: pextrw $3, %xmm3, %esi
-; SSE3-NEXT: addl %eax, %esi
-; SSE3-NEXT: pextrw $4, %xmm3, %eax
-; SSE3-NEXT: pextrw $5, %xmm3, %edi
-; SSE3-NEXT: addl %eax, %edi
-; SSE3-NEXT: pextrw $6, %xmm3, %ecx
+; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: pextrw $1, %xmm3, %ebp
+; SSE3-NEXT: addl %eax, %ebp
+; SSE3-NEXT: pextrw $2, %xmm3, %edx
+; SSE3-NEXT: pextrw $3, %xmm3, %edi
+; SSE3-NEXT: addl %edx, %edi
+; SSE3-NEXT: pextrw $4, %xmm3, %edx
+; SSE3-NEXT: pextrw $5, %xmm3, %ecx
+; SSE3-NEXT: addl %edx, %ecx
+; SSE3-NEXT: pextrw $6, %xmm3, %edx
; SSE3-NEXT: pextrw $7, %xmm3, %eax
-; SSE3-NEXT: addl %ecx, %eax
-; SSE3-NEXT: movd %edx, %xmm8
-; SSE3-NEXT: movd %r13d, %xmm3
-; SSE3-NEXT: movd %ebp, %xmm9
-; SSE3-NEXT: movd %r11d, %xmm4
-; SSE3-NEXT: movd %ebx, %xmm10
-; SSE3-NEXT: movd %r10d, %xmm7
-; SSE3-NEXT: movd %r14d, %xmm11
+; SSE3-NEXT: addl %edx, %eax
+; SSE3-NEXT: movd %esi, %xmm8
+; SSE3-NEXT: movd %r8d, %xmm3
+; SSE3-NEXT: movd %ebx, %xmm9
+; SSE3-NEXT: movd %r13d, %xmm4
+; SSE3-NEXT: movd %r15d, %xmm10
+; SSE3-NEXT: movd %r11d, %xmm7
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm11 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm11 = mem[0],zero,zero,zero
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movd %r8d, %xmm6
-; SSE3-NEXT: movd %esi, %xmm13
-; SSE3-NEXT: movd %r12d, %xmm5
-; SSE3-NEXT: movd %edi, %xmm14
-; SSE3-NEXT: movd %r15d, %xmm2
-; SSE3-NEXT: movd %r9d, %xmm15
-; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
-; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: movd %ecx, %xmm6
+; SSE3-NEXT: movd %edi, %xmm13
+; SSE3-NEXT: movd %ebp, %xmm5
+; SSE3-NEXT: movd %r9d, %xmm14
+; SSE3-NEXT: movd %r12d, %xmm2
+; SSE3-NEXT: movd %r14d, %xmm15
+; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: popq %r12
; SSE3-NEXT: popq %r13
@@ -858,12 +858,12 @@ define <4 x i32> @not_a_hsub_1(<4 x i32>
; SSE-NEXT: movd %xmm0, %edi
; SSE-NEXT: subl %edi, %esi
; SSE-NEXT: movd %esi, %xmm0
-; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movd %edx, %xmm2
+; SSE-NEXT: movd %ecx, %xmm2
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_1:
@@ -919,11 +919,11 @@ define <4 x float> @not_a_hsub_2(<4 x fl
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: subss %xmm4, %xmm3
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE-NEXT: subss %xmm3, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE-NEXT: subss %xmm4, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_2:
@@ -1162,19 +1162,19 @@ define <8 x i32> @avx2_hadd_d(<8 x i32>
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
; SSE3-NEXT: addl %edx, %r9d
-; SSE3-NEXT: movd %xmm2, %esi
+; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE3-NEXT: movd %xmm0, %r10d
-; SSE3-NEXT: addl %esi, %r10d
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %edx, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
-; SSE3-NEXT: addl %esi, %edi
+; SSE3-NEXT: addl %edx, %edi
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE3-NEXT: movd %xmm0, %r11d
-; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: movd %xmm0, %r10d
+; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
@@ -1185,24 +1185,24 @@ define <8 x i32> @avx2_hadd_d(<8 x i32>
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
-; SSE3-NEXT: movd %xmm0, %esi
-; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: addl %r11d, %eax
; SSE3-NEXT: movd %edi, %xmm0
-; SSE3-NEXT: movd %r9d, %xmm1
+; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movd %r10d, %xmm2
+; SSE3-NEXT: movd %r9d, %xmm2
; SSE3-NEXT: movd %r8d, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE3-NEXT: movd %esi, %xmm1
-; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE3-NEXT: movd %edx, %xmm3
-; SSE3-NEXT: movd %r11d, %xmm1
+; SSE3-NEXT: movd %ecx, %xmm3
+; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_d:
@@ -1293,15 +1293,14 @@ define <16 x i16> @avx2_hadd_w(<16 x i16
; SSE3-NEXT: .Lcfi23:
; SSE3-NEXT: .cfi_offset %rbp, -16
; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: pextrw $1, %xmm0, %ecx
-; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $1, %xmm0, %r10d
+; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pextrw $2, %xmm0, %eax
-; SSE3-NEXT: pextrw $3, %xmm0, %r15d
-; SSE3-NEXT: addl %eax, %r15d
+; SSE3-NEXT: pextrw $3, %xmm0, %r11d
+; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pextrw $4, %xmm0, %eax
-; SSE3-NEXT: pextrw $5, %xmm0, %r14d
-; SSE3-NEXT: addl %eax, %r14d
+; SSE3-NEXT: pextrw $5, %xmm0, %r12d
+; SSE3-NEXT: addl %eax, %r12d
; SSE3-NEXT: pextrw $6, %xmm0, %eax
; SSE3-NEXT: pextrw $7, %xmm0, %r13d
; SSE3-NEXT: addl %eax, %r13d
@@ -1310,70 +1309,71 @@ define <16 x i16> @avx2_hadd_w(<16 x i16
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm1, %eax
-; SSE3-NEXT: pextrw $3, %xmm1, %r11d
-; SSE3-NEXT: addl %eax, %r11d
-; SSE3-NEXT: pextrw $4, %xmm1, %eax
-; SSE3-NEXT: pextrw $5, %xmm1, %r10d
-; SSE3-NEXT: addl %eax, %r10d
-; SSE3-NEXT: pextrw $6, %xmm1, %eax
-; SSE3-NEXT: pextrw $7, %xmm1, %r12d
-; SSE3-NEXT: addl %eax, %r12d
-; SSE3-NEXT: movd %xmm2, %eax
-; SSE3-NEXT: pextrw $1, %xmm2, %ebx
-; SSE3-NEXT: addl %eax, %ebx
-; SSE3-NEXT: pextrw $2, %xmm2, %eax
-; SSE3-NEXT: pextrw $3, %xmm2, %ecx
+; SSE3-NEXT: pextrw $3, %xmm1, %ecx
; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $4, %xmm1, %eax
+; SSE3-NEXT: pextrw $5, %xmm1, %r14d
+; SSE3-NEXT: addl %eax, %r14d
+; SSE3-NEXT: pextrw $6, %xmm1, %esi
+; SSE3-NEXT: pextrw $7, %xmm1, %r15d
+; SSE3-NEXT: addl %esi, %r15d
+; SSE3-NEXT: movd %xmm2, %esi
+; SSE3-NEXT: pextrw $1, %xmm2, %ebp
+; SSE3-NEXT: addl %esi, %ebp
+; SSE3-NEXT: pextrw $2, %xmm2, %esi
+; SSE3-NEXT: pextrw $3, %xmm2, %edi
+; SSE3-NEXT: addl %esi, %edi
; SSE3-NEXT: pextrw $4, %xmm2, %esi
-; SSE3-NEXT: pextrw $5, %xmm2, %r8d
-; SSE3-NEXT: addl %esi, %r8d
+; SSE3-NEXT: pextrw $5, %xmm2, %eax
+; SSE3-NEXT: addl %esi, %eax
; SSE3-NEXT: pextrw $6, %xmm2, %esi
-; SSE3-NEXT: pextrw $7, %xmm2, %edx
-; SSE3-NEXT: addl %esi, %edx
-; SSE3-NEXT: movd %xmm3, %edi
+; SSE3-NEXT: pextrw $7, %xmm2, %ecx
+; SSE3-NEXT: addl %esi, %ecx
+; SSE3-NEXT: movd %xmm3, %ebx
; SSE3-NEXT: pextrw $1, %xmm3, %r9d
-; SSE3-NEXT: addl %edi, %r9d
-; SSE3-NEXT: pextrw $2, %xmm3, %ebp
-; SSE3-NEXT: pextrw $3, %xmm3, %edi
-; SSE3-NEXT: addl %ebp, %edi
-; SSE3-NEXT: pextrw $4, %xmm3, %eax
-; SSE3-NEXT: pextrw $5, %xmm3, %ebp
-; SSE3-NEXT: addl %eax, %ebp
-; SSE3-NEXT: pextrw $6, %xmm3, %esi
-; SSE3-NEXT: pextrw $7, %xmm3, %eax
-; SSE3-NEXT: addl %esi, %eax
-; SSE3-NEXT: movd %edx, %xmm8
-; SSE3-NEXT: movd %r13d, %xmm3
-; SSE3-NEXT: movd %ecx, %xmm9
-; SSE3-NEXT: movd %r15d, %xmm4
-; SSE3-NEXT: movd %r8d, %xmm10
-; SSE3-NEXT: movd %r14d, %xmm7
-; SSE3-NEXT: movd %ebx, %xmm11
-; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
-; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movd %r12d, %xmm6
-; SSE3-NEXT: movd %edi, %xmm13
-; SSE3-NEXT: movd %r11d, %xmm5
-; SSE3-NEXT: movd %ebp, %xmm14
-; SSE3-NEXT: movd %r10d, %xmm2
-; SSE3-NEXT: movd %r9d, %xmm15
+; SSE3-NEXT: addl %ebx, %r9d
+; SSE3-NEXT: pextrw $2, %xmm3, %edx
+; SSE3-NEXT: pextrw $3, %xmm3, %ebx
+; SSE3-NEXT: addl %edx, %ebx
+; SSE3-NEXT: pextrw $4, %xmm3, %edx
+; SSE3-NEXT: pextrw $5, %xmm3, %esi
+; SSE3-NEXT: addl %edx, %esi
+; SSE3-NEXT: pextrw $6, %xmm3, %r8d
+; SSE3-NEXT: pextrw $7, %xmm3, %edx
+; SSE3-NEXT: addl %r8d, %edx
+; SSE3-NEXT: movd %ecx, %xmm8
+; SSE3-NEXT: movd %eax, %xmm3
+; SSE3-NEXT: movd %edi, %xmm9
+; SSE3-NEXT: movd %ebp, %xmm4
+; SSE3-NEXT: movd %r13d, %xmm10
+; SSE3-NEXT: movd %r12d, %xmm7
+; SSE3-NEXT: movd %r11d, %xmm11
+; SSE3-NEXT: movd %r10d, %xmm0
+; SSE3-NEXT: movd %edx, %xmm12
+; SSE3-NEXT: movd %esi, %xmm6
+; SSE3-NEXT: movd %ebx, %xmm13
+; SSE3-NEXT: movd %r9d, %xmm5
+; SSE3-NEXT: movd %r15d, %xmm14
+; SSE3-NEXT: movd %r14d, %xmm2
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm15 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: popq %r12
; SSE3-NEXT: popq %r13
Modified: llvm/trunk/test/CodeGen/X86/haddsub-undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub-undef.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub-undef.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub-undef.ll Sun Jun 4 15:12:04 2017
@@ -171,9 +171,8 @@ define <4 x float> @test8_undef(<4 x flo
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: addss %xmm2, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test8_undef:
Modified: llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll Sun Jun 4 15:12:04 2017
@@ -269,10 +269,8 @@ define <4 x float> @merge_4f32_f32_012u(
; SSE2-LABEL: merge_4f32_f32_012u:
; SSE2: # BB#0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_012u:
@@ -290,11 +288,11 @@ define <4 x float> @merge_4f32_f32_012u(
; X32-SSE1-LABEL: merge_4f32_f32_012u:
; X32-SSE1: # BB#0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_012u:
@@ -320,10 +318,8 @@ define <4 x float> @merge_4f32_f32_019u(
; SSE2-LABEL: merge_4f32_f32_019u:
; SSE2: # BB#0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_019u:
@@ -341,11 +337,11 @@ define <4 x float> @merge_4f32_f32_019u(
; X32-SSE1-LABEL: merge_4f32_f32_019u:
; X32-SSE1: # BB#0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_019u:
@@ -1037,13 +1033,11 @@ define <2 x i64> @merge_2i64_i64_12_vola
define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_2345_volatile:
; SSE2: # BB#0:
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_2345_volatile:
@@ -1065,13 +1059,13 @@ define <4 x float> @merge_4f32_f32_2345_
; X32-SSE1-LABEL: merge_4f32_f32_2345_volatile:
; X32-SSE1: # BB#0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile:
Modified: llvm/trunk/test/CodeGen/X86/select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/select.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/select.ll (original)
+++ llvm/trunk/test/CodeGen/X86/select.ll Sun Jun 4 15:12:04 2017
@@ -314,13 +314,13 @@ define void @test8(i1 %c, <6 x i32>* %ds
; GENERIC-NEXT: jmp LBB7_6
; GENERIC-NEXT: LBB7_4:
; GENERIC-NEXT: movd %r9d, %xmm1
-; GENERIC-NEXT: movd %ecx, %xmm2
+; GENERIC-NEXT: movd %r8d, %xmm2
; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; GENERIC-NEXT: movd %r8d, %xmm3
+; GENERIC-NEXT: movd %ecx, %xmm3
; GENERIC-NEXT: movd %edx, %xmm1
; GENERIC-NEXT: LBB7_6:
; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0
; GENERIC-NEXT: movq %xmm0, 16(%rsi)
@@ -350,16 +350,16 @@ define void @test8(i1 %c, <6 x i32>* %ds
; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; ATOM-NEXT: jmp LBB7_6
; ATOM-NEXT: LBB7_4:
; ATOM-NEXT: movd %r9d, %xmm1
-; ATOM-NEXT: movd %ecx, %xmm2
+; ATOM-NEXT: movd %r8d, %xmm2
; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; ATOM-NEXT: movd %r8d, %xmm3
+; ATOM-NEXT: movd %ecx, %xmm3
; ATOM-NEXT: movd %edx, %xmm1
; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; ATOM-NEXT: LBB7_6:
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1
Modified: llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll Sun Jun 4 15:12:04 2017
@@ -53,17 +53,17 @@ define <4 x float> @test_mm_and_ps(<4 x
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebp
@@ -86,18 +86,18 @@ define <4 x float> @test_mm_and_ps(<4 x
; X64-NEXT: shrq $32, %rsi
; X64-NEXT: shrq $32, %rdi
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl %r8d, %edi
; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl %eax, %esi
; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
@@ -121,15 +121,15 @@ define <4 x float> @test_mm_andnot_ps(<4
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: notl %edx
-; X32-NEXT: notl %ecx
; X32-NEXT: notl %esi
+; X32-NEXT: notl %ecx
; X32-NEXT: notl %eax
; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, (%esp)
-; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -138,7 +138,7 @@ define <4 x float> @test_mm_andnot_ps(<4
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebp
@@ -165,18 +165,18 @@ define <4 x float> @test_mm_andnot_ps(<4
; X64-NEXT: notl %esi
; X64-NEXT: notl %edx
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl %r8d, %edx
; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl %edi, %esi
; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
@@ -1277,17 +1277,17 @@ define <4 x float> @test_mm_or_ps(<4 x f
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: orl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebp
@@ -1310,18 +1310,18 @@ define <4 x float> @test_mm_or_ps(<4 x f
; X64-NEXT: shrq $32, %rsi
; X64-NEXT: shrq $32, %rdi
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: orl %r8d, %edi
; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: orl %eax, %esi
; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
@@ -1538,16 +1538,16 @@ define <4 x float> @test_mm_set_ps(float
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_ps:
; X64: # BB#0:
-; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; X64-NEXT: movaps %xmm3, %xmm0
; X64-NEXT: retq
%res0 = insertelement <4 x float> undef, float %a3, i32 0
@@ -1677,16 +1677,16 @@ define <4 x float> @test_mm_setr_ps(floa
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_ps:
; X64: # BB#0:
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT: retq
%res0 = insertelement <4 x float> undef, float %a0, i32 0
%res1 = insertelement <4 x float> %res0, float %a1, i32 1
@@ -2239,17 +2239,17 @@ define <4 x float> @test_mm_xor_ps(<4 x
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebp
@@ -2272,18 +2272,18 @@ define <4 x float> @test_mm_xor_ps(<4 x
; X64-NEXT: shrq $32, %rsi
; X64-NEXT: shrq $32, %rdi
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorl %r8d, %edi
; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorl %eax, %esi
; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
Modified: llvm/trunk/test/CodeGen/X86/sse1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse1.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse1.ll Sun Jun 4 15:12:04 2017
@@ -87,17 +87,17 @@ define <4 x float> @vselect(<4 x float>*
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: .LBB1_11: # %entry
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X32-NEXT: retl
;
; X64-LABEL: vselect:
; X64: # BB#0: # %entry
-; X64-NEXT: testl %ecx, %ecx
+; X64-NEXT: testl %edx, %edx
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: je .LBB1_1
; X64-NEXT: # BB#2: # %entry
; X64-NEXT: xorps %xmm1, %xmm1
-; X64-NEXT: testl %edx, %edx
+; X64-NEXT: testl %ecx, %ecx
; X64-NEXT: jne .LBB1_5
; X64-NEXT: .LBB1_4:
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -111,7 +111,7 @@ define <4 x float> @vselect(<4 x float>*
; X64-NEXT: jmp .LBB1_11
; X64-NEXT: .LBB1_1:
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: testl %edx, %edx
+; X64-NEXT: testl %ecx, %ecx
; X64-NEXT: je .LBB1_4
; X64-NEXT: .LBB1_5: # %entry
; X64-NEXT: xorps %xmm2, %xmm2
@@ -126,7 +126,7 @@ define <4 x float> @vselect(<4 x float>*
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: .LBB1_11: # %entry
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT: retq
entry:
%a1 = icmp eq <4 x i32> %q, zeroinitializer
@@ -252,12 +252,12 @@ define <2 x float> @PR31672() #0 {
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: notl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: orl %edx, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: andl %eax, %ecx
+; X32-NEXT: notl %eax
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: andl %ecx, %edx
@@ -277,7 +277,7 @@ define <2 x float> @PR31672() #0 {
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
@@ -297,48 +297,48 @@ define <2 x float> @PR31672() #0 {
; X64-NEXT: mulps %xmm1, %xmm0
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r10
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
-; X64-NEXT: movl %r9d, %esi
-; X64-NEXT: andl %edi, %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl %edi, %eax
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: notl %ecx
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r10
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: andl %eax, %ecx
-; X64-NEXT: orl %esi, %ecx
+; X64-NEXT: andl %edx, %ecx
+; X64-NEXT: orl %eax, %ecx
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %r8d, %ecx
-; X64-NEXT: andl %r10d, %ecx
-; X64-NEXT: movl %r10d, %esi
-; X64-NEXT: notl %esi
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: orl %ecx, %esi
-; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrq $32, %r9
+; X64-NEXT: shrq $32, %rsi
; X64-NEXT: shrq $32, %rdi
-; X64-NEXT: andl %edi, %r9d
+; X64-NEXT: andl %edi, %esi
; X64-NEXT: notl %edi
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: andl %edi, %eax
-; X64-NEXT: orl %r9d, %eax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrq $32, %r8
-; X64-NEXT: shrq $32, %r10
-; X64-NEXT: andl %r10d, %r8d
-; X64-NEXT: notl %r10d
; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: andl %r10d, %edx
-; X64-NEXT: orl %r8d, %edx
+; X64-NEXT: andl %edi, %edx
+; X64-NEXT: orl %esi, %edx
; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: andl %r9d, %eax
+; X64-NEXT: movl %r9d, %ecx
+; X64-NEXT: notl %ecx
+; X64-NEXT: andl %r10d, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: shrq $32, %r9
+; X64-NEXT: andl %r9d, %r8d
+; X64-NEXT: notl %r9d
+; X64-NEXT: shrq $32, %r10
+; X64-NEXT: andl %r9d, %r10d
+; X64-NEXT: orl %r8d, %r10d
+; X64-NEXT: movl %r10d, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%t0 = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> <float 42.0, float 3.0>)
ret <2 x float> %t0
Modified: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll Sun Jun 4 15:12:04 2017
@@ -2076,7 +2076,7 @@ define <2 x i64> @test_mm_set_epi8(i8 %a
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm2
; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -2087,8 +2087,8 @@ define <2 x i64> @test_mm_set_epi8(i8 %a
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm1
; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -2099,7 +2099,7 @@ define <2 x i64> @test_mm_set_epi8(i8 %a
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm3
; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -2110,27 +2110,27 @@ define <2 x i64> @test_mm_set_epi8(i8 %a
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_epi8:
; X64: # BB#0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: movzbl %dl, %eax
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzbl %r9b, %eax
; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
@@ -2138,20 +2138,20 @@ define <2 x i64> @test_mm_set_epi8(i8 %a
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm2
@@ -2161,9 +2161,9 @@ define <2 x i64> @test_mm_set_epi8(i8 %a
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res0 = insertelement <16 x i8> undef, i8 %a15, i32 0
%res1 = insertelement <16 x i8> %res0, i8 %a14, i32 1
@@ -2206,11 +2206,11 @@ define <2 x i64> @test_mm_set_epi16(i16
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; X32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_epi16:
@@ -2218,20 +2218,20 @@ define <2 x i64> @test_mm_set_epi16(i16
; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: movd %r8d, %xmm1
+; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: movd %edx, %xmm0
-; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: movd %ecx, %xmm2
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X64-NEXT: movd %esi, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: movd %r8d, %xmm0
; X64-NEXT: movd %r9d, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT: movd %ecx, %xmm3
+; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: movd %r10d, %xmm0
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT: retq
%res0 = insertelement <8 x i16> undef, i16 %a7, i32 0
%res1 = insertelement <8 x i16> %res0, i16 %a6, i32 1
@@ -2254,18 +2254,18 @@ define <2 x i64> @test_mm_set_epi32(i32
; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_epi32:
; X64: # BB#0:
; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: movd %edx, %xmm1
+; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movd %esi, %xmm2
+; X64-NEXT: movd %edx, %xmm2
; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res0 = insertelement <4 x i32> undef, i32 %a3, i32 0
%res1 = insertelement <4 x i32> %res0, i32 %a2, i32 1
@@ -2282,11 +2282,11 @@ define <2 x i64> @test_mm_set_epi64x(i64
; X32: # BB#0:
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_epi64x:
@@ -2441,10 +2441,9 @@ define <2 x i64> @test_mm_set1_epi64x(i6
; X32-LABEL: test_mm_set1_epi64x:
; X32: # BB#0:
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_epi64x:
@@ -2486,7 +2485,7 @@ define <2 x i64> @test_mm_setr_epi8(i8 %
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm2
; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -2497,8 +2496,8 @@ define <2 x i64> @test_mm_setr_epi8(i8 %
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm1
; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -2509,7 +2508,7 @@ define <2 x i64> @test_mm_setr_epi8(i8 %
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm3
; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -2520,9 +2519,9 @@ define <2 x i64> @test_mm_setr_epi8(i8 %
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_epi8:
@@ -2534,46 +2533,46 @@ define <2 x i64> @test_mm_setr_epi8(i8 %
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzbl %r9b, %eax
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: movzbl %dl, %eax
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res0 = insertelement <16 x i8> undef, i8 %a0 , i32 0
%res1 = insertelement <16 x i8> %res0, i8 %a1 , i32 1
@@ -2616,11 +2615,11 @@ define <2 x i64> @test_mm_setr_epi16(i16
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; X32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_epi16:
@@ -2628,20 +2627,20 @@ define <2 x i64> @test_mm_setr_epi16(i16
; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movd %ecx, %xmm1
+; X64-NEXT: movd %r10d, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: movd %r9d, %xmm0
-; X64-NEXT: movd %esi, %xmm2
+; X64-NEXT: movd %r8d, %xmm2
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X64-NEXT: movd %r10d, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: movd %edx, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT: movd %r8d, %xmm3
+; X64-NEXT: movd %esi, %xmm3
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT: retq
%res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
%res1 = insertelement <8 x i16> %res0, i16 %a1, i32 1
@@ -2664,18 +2663,18 @@ define <2 x i64> @test_mm_setr_epi32(i32
; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_epi32:
; X64: # BB#0:
; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: movd %edx, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movd %edx, %xmm2
+; X64-NEXT: movd %esi, %xmm2
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
%res1 = insertelement <4 x i32> %res0, i32 %a1, i32 1
@@ -2692,11 +2691,11 @@ define <2 x i64> @test_mm_setr_epi64x(i6
; X32: # BB#0:
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_epi64x:
Modified: llvm/trunk/test/CodeGen/X86/sse3-avx-addsub-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse3-avx-addsub-2.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse3-avx-addsub-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse3-avx-addsub-2.ll Sun Jun 4 15:12:04 2017
@@ -342,9 +342,8 @@ define <4 x float> @test14(<4 x float> %
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: subss %xmm1, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,1,3]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test14:
@@ -375,8 +374,7 @@ define <4 x float> @test15(<4 x float> %
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,2,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -417,10 +415,10 @@ define <4 x float> @test16(<4 x float> %
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test16:
Modified: llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll Sun Jun 4 15:12:04 2017
@@ -1320,17 +1320,17 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -1560,33 +1560,33 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: movaps %xmm1, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll Sun Jun 4 15:12:04 2017
@@ -1169,16 +1169,16 @@ define <4 x float> @sitofp_2i64_to_4f32_
define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
; SSE: # BB#0:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -1368,21 +1368,22 @@ define <4 x float> @sitofp_4i64_to_4f32(
; SSE-LABEL: sitofp_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_4i64_to_4f32:
@@ -1838,21 +1839,14 @@ define <4 x float> @uitofp_4i64_to_4f32_
; SSE-LABEL: uitofp_4i64_to_4f32_undef:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: js .LBB41_2
-; SSE-NEXT: # BB#1:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: .LBB41_2:
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB41_3
-; SSE-NEXT: # BB#4:
+; SSE-NEXT: js .LBB41_1
+; SSE-NEXT: # BB#2:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: jmp .LBB41_5
-; SSE-NEXT: .LBB41_3:
+; SSE-NEXT: jmp .LBB41_3
+; SSE-NEXT: .LBB41_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
@@ -1860,17 +1854,16 @@ define <4 x float> @uitofp_4i64_to_4f32_
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
-; SSE-NEXT: .LBB41_5:
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: .LBB41_3:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB41_6
-; SSE-NEXT: # BB#7:
+; SSE-NEXT: js .LBB41_4
+; SSE-NEXT: # BB#5:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: jmp .LBB41_8
-; SSE-NEXT: .LBB41_6:
+; SSE-NEXT: jmp .LBB41_6
+; SSE-NEXT: .LBB41_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
@@ -1878,9 +1871,16 @@ define <4 x float> @uitofp_4i64_to_4f32_
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
-; SSE-NEXT: .LBB41_8:
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: .LBB41_6:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: js .LBB41_8
+; SSE-NEXT: # BB#7:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: .LBB41_8:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_4i64_to_4f32_undef:
@@ -2149,32 +2149,32 @@ define <4 x float> @uitofp_4i64_to_4f32(
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_1
; SSE-NEXT: # BB#2:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB47_3
; SSE-NEXT: .LBB47_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; SSE-NEXT: addss %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB47_3:
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_4
; SSE-NEXT: # BB#5:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB47_6
; SSE-NEXT: .LBB47_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: addss %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB47_6:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_7
; SSE-NEXT: # BB#8:
@@ -2208,9 +2208,9 @@ define <4 x float> @uitofp_4i64_to_4f32(
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB47_12:
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32:
@@ -3381,22 +3381,23 @@ define <4 x float> @sitofp_load_4i64_to_
; SSE-LABEL: sitofp_load_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: movdqa 16(%rdi), %xmm2
-; SSE-NEXT: movq %xmm2, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: movdqa 16(%rdi), %xmm0
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE-NEXT: movq %xmm2, %rax
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_4i64_to_4f32:
@@ -3546,41 +3547,42 @@ define <8 x float> @sitofp_load_8i64_to_
; SSE-LABEL: sitofp_load_8i64_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: movdqa 16(%rdi), %xmm2
-; SSE-NEXT: movdqa 32(%rdi), %xmm3
-; SSE-NEXT: movdqa 48(%rdi), %xmm4
-; SSE-NEXT: movq %xmm2, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm5
+; SSE-NEXT: movdqa 16(%rdi), %xmm0
+; SSE-NEXT: movdqa 32(%rdi), %xmm2
+; SSE-NEXT: movdqa 48(%rdi), %xmm3
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE-NEXT: movq %xmm2, %rax
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq %xmm4, %rax
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE-NEXT: movq %xmm3, %rax
+; SSE-NEXT: xorps %xmm4, %xmm4
+; SSE-NEXT: cvtsi2ssq %rax, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: movq %xmm2, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE-NEXT: movq %xmm3, %rax
-; SSE-NEXT: xorps %xmm3, %xmm3
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_8i64_to_8f32:
@@ -3822,73 +3824,73 @@ define <8 x float> @sitofp_load_8i8_to_8
define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-LABEL: uitofp_load_4i64_to_4f32:
; SSE: # BB#0:
-; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: movdqa 16(%rdi), %xmm3
-; SSE-NEXT: movq %xmm3, %rax
+; SSE-NEXT: movdqa (%rdi), %xmm2
+; SSE-NEXT: movdqa 16(%rdi), %xmm0
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_1
; SSE-NEXT: # BB#2:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB76_3
; SSE-NEXT: .LBB76_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: addss %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB76_3:
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_4
; SSE-NEXT: # BB#5:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB76_6
; SSE-NEXT: .LBB76_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: addss %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB76_6:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE-NEXT: movq %xmm3, %rax
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_7
; SSE-NEXT: # BB#8:
-; SSE-NEXT: xorps %xmm3, %xmm3
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB76_9
; SSE-NEXT: .LBB76_7:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm3, %xmm3
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; SSE-NEXT: addss %xmm3, %xmm3
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB76_9:
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_10
; SSE-NEXT: # BB#11:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB76_12
; SSE-NEXT: .LBB76_10:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: addss %xmm1, %xmm1
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB76_12:
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_4i64_to_4f32:
@@ -4186,121 +4188,121 @@ define <4 x float> @uitofp_load_4i8_to_4
define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-LABEL: uitofp_load_8i64_to_8f32:
; SSE: # BB#0:
-; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: movdqa 16(%rdi), %xmm5
+; SSE-NEXT: movdqa (%rdi), %xmm5
+; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: movdqa 48(%rdi), %xmm3
-; SSE-NEXT: movq %xmm5, %rax
+; SSE-NEXT: movdqa 48(%rdi), %xmm1
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_1
; SSE-NEXT: # BB#2:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm4
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB80_3
; SSE-NEXT: .LBB80_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm4
-; SSE-NEXT: addss %xmm4, %xmm4
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB80_3:
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_4
; SSE-NEXT: # BB#5:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: jmp .LBB80_6
; SSE-NEXT: .LBB80_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: addss %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm4
+; SSE-NEXT: addss %xmm4, %xmm4
; SSE-NEXT: .LBB80_6:
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE-NEXT: movq %xmm5, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_7
; SSE-NEXT: # BB#8:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm6
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB80_9
; SSE-NEXT: .LBB80_7:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm6
-; SSE-NEXT: addss %xmm6, %xmm6
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB80_9:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE-NEXT: movq %xmm5, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_10
; SSE-NEXT: # BB#11:
-; SSE-NEXT: xorps %xmm5, %xmm5
-; SSE-NEXT: cvtsi2ssq %rax, %xmm5
+; SSE-NEXT: cvtsi2ssq %rax, %xmm6
; SSE-NEXT: jmp .LBB80_12
; SSE-NEXT: .LBB80_10:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm5, %xmm5
-; SSE-NEXT: cvtsi2ssq %rax, %xmm5
-; SSE-NEXT: addss %xmm5, %xmm5
+; SSE-NEXT: cvtsi2ssq %rax, %xmm6
+; SSE-NEXT: addss %xmm6, %xmm6
; SSE-NEXT: .LBB80_12:
-; SSE-NEXT: movq %xmm3, %rax
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_13
; SSE-NEXT: # BB#14:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm7
+; SSE-NEXT: xorps %xmm5, %xmm5
+; SSE-NEXT: cvtsi2ssq %rax, %xmm5
; SSE-NEXT: jmp .LBB80_15
; SSE-NEXT: .LBB80_13:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm7
-; SSE-NEXT: addss %xmm7, %xmm7
+; SSE-NEXT: xorps %xmm5, %xmm5
+; SSE-NEXT: cvtsi2ssq %rax, %xmm5
+; SSE-NEXT: addss %xmm5, %xmm5
; SSE-NEXT: .LBB80_15:
-; SSE-NEXT: movq %xmm2, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_16
; SSE-NEXT: # BB#17:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: jmp .LBB80_18
; SSE-NEXT: .LBB80_16:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: addss %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm7
+; SSE-NEXT: addss %xmm7, %xmm7
; SSE-NEXT: .LBB80_18:
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE-NEXT: movq %xmm3, %rax
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_19
; SSE-NEXT: # BB#20:
-; SSE-NEXT: xorps %xmm3, %xmm3
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB80_21
; SSE-NEXT: .LBB80_19:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm3, %xmm3
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; SSE-NEXT: addss %xmm3, %xmm3
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB80_21:
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
@@ -4318,8 +4320,8 @@ define <8 x float> @uitofp_load_8i64_to_
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB80_24:
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i64_to_8f32:
Modified: llvm/trunk/test/CodeGen/X86/vec_set.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_set.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_set.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_set.ll Sun Jun 4 15:12:04 2017
@@ -12,35 +12,35 @@ define void @test(<8 x i16>* %b, i16 %a0
; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X86-NEXT: movdqa %xmm3, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test:
; X64: # BB#0:
-; X64-NEXT: movd %r8d, %xmm0
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT: movd %edx, %xmm1
-; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movd %r9d, %xmm0
; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: movd %r9d, %xmm2
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movd %r8d, %xmm1
+; X64-NEXT: movd %ecx, %xmm2
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: movd %edx, %xmm1
; X64-NEXT: movd %esi, %xmm3
-; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; X64-NEXT: movdqa %xmm3, (%rdi)
; X64-NEXT: retq
%tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0
Modified: llvm/trunk/test/CodeGen/X86/vector-rem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-rem.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-rem.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-rem.ll Sun Jun 4 15:12:04 2017
@@ -11,9 +11,9 @@ define <4 x i32> @foo(<4 x i32> %t, <4 x
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; CHECK-NEXT: movd %xmm3, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; CHECK-NEXT: movd %xmm3, %ecx
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
@@ -24,15 +24,15 @@ define <4 x i32> @foo(<4 x i32> %t, <4 x
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; CHECK-NEXT: movd %xmm0, %ecx
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
; CHECK-NEXT: movd %edx, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%m = srem <4 x i32> %t, %u
@@ -49,9 +49,9 @@ define <4 x i32> @bar(<4 x i32> %t, <4 x
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; CHECK-NEXT: movd %xmm3, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; CHECK-NEXT: movd %xmm3, %ecx
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
@@ -62,15 +62,15 @@ define <4 x i32> @bar(<4 x i32> %t, <4 x
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; CHECK-NEXT: movd %xmm0, %ecx
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
; CHECK-NEXT: movd %edx, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%m = urem <4 x i32> %t, %u
@@ -88,9 +88,9 @@ define <4 x float> @qux(<4 x float> %t,
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
@@ -100,15 +100,15 @@ define <4 x float> @qux(<4 x float> %t,
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: unpcklps (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movapd %xmm1, %xmm0
; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: retq
%m = frem <4 x float> %t, %u
Modified: llvm/trunk/test/CodeGen/X86/vector-sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-sext.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-sext.ll Sun Jun 4 15:12:04 2017
@@ -1333,19 +1333,19 @@ define <4 x i32> @load_sext_4i1_to_4i32(
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $62, %rcx
+; SSE2-NEXT: shlq $61, %rcx
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $61, %rcx
+; SSE2-NEXT: shlq $62, %rcx
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: shlq $63, %rax
; SSE2-NEXT: sarq $63, %rax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i1_to_4i32:
@@ -1356,19 +1356,19 @@ define <4 x i32> @load_sext_4i1_to_4i32(
; SSSE3-NEXT: sarq $63, %rcx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $62, %rcx
+; SSSE3-NEXT: shlq $61, %rcx
; SSSE3-NEXT: sarq $63, %rcx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $61, %rcx
+; SSSE3-NEXT: shlq $62, %rcx
; SSSE3-NEXT: sarq $63, %rcx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: shlq $63, %rax
; SSSE3-NEXT: sarq $63, %rax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i1_to_4i32:
@@ -1523,14 +1523,14 @@ define <4 x i64> @load_sext_4i1_to_4i64(
; SSE2-NEXT: shrl $3, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl %ecx
+; SSE2-NEXT: shrl $2, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: shrl $2, %eax
+; SSE2-NEXT: shrl %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSE2-NEXT: psllq $63, %xmm0
@@ -1549,14 +1549,14 @@ define <4 x i64> @load_sext_4i1_to_4i64(
; SSSE3-NEXT: shrl $3, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl %ecx
+; SSSE3-NEXT: shrl $2, %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: shrl $2, %eax
+; SSSE3-NEXT: shrl %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSSE3-NEXT: psllq $63, %xmm0
@@ -1813,7 +1813,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(
; SSE2-NEXT: shrq $7, %rcx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $60, %rcx
+; SSE2-NEXT: shlq $57, %rcx
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -1822,13 +1822,13 @@ define <8 x i16> @load_sext_8i1_to_8i16(
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $62, %rcx
+; SSE2-NEXT: shlq $59, %rcx
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $57, %rcx
+; SSE2-NEXT: shlq $60, %rcx
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movq %rax, %rcx
@@ -1837,15 +1837,15 @@ define <8 x i16> @load_sext_8i1_to_8i16(
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $59, %rcx
+; SSE2-NEXT: shlq $62, %rcx
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: movd %ecx, %xmm3
; SSE2-NEXT: shlq $63, %rax
; SSE2-NEXT: sarq $63, %rax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_8i1_to_8i16:
@@ -1855,7 +1855,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(
; SSSE3-NEXT: shrq $7, %rcx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $60, %rcx
+; SSSE3-NEXT: shlq $57, %rcx
; SSSE3-NEXT: sarq $63, %rcx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -1864,13 +1864,13 @@ define <8 x i16> @load_sext_8i1_to_8i16(
; SSSE3-NEXT: sarq $63, %rcx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $62, %rcx
+; SSSE3-NEXT: shlq $59, %rcx
; SSSE3-NEXT: sarq $63, %rcx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $57, %rcx
+; SSSE3-NEXT: shlq $60, %rcx
; SSSE3-NEXT: sarq $63, %rcx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movq %rax, %rcx
@@ -1879,15 +1879,15 @@ define <8 x i16> @load_sext_8i1_to_8i16(
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $59, %rcx
+; SSSE3-NEXT: shlq $62, %rcx
; SSSE3-NEXT: sarq $63, %rcx
; SSSE3-NEXT: movd %ecx, %xmm3
; SSSE3-NEXT: shlq $63, %rax
; SSSE3-NEXT: sarq $63, %rax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_8i1_to_8i16:
@@ -2191,7 +2191,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $6, %ecx
+; SSE2-NEXT: shrl $3, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
@@ -2203,30 +2203,30 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $4, %ecx
+; SSE2-NEXT: shrl %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $5, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl %ecx
+; SSE2-NEXT: shrl $4, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $3, %ecx
+; SSE2-NEXT: shrl $6, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: shrl $7, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm0
@@ -2240,7 +2240,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $6, %ecx
+; SSSE3-NEXT: shrl $3, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
@@ -2252,30 +2252,30 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $4, %ecx
+; SSSE3-NEXT: shrl %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $5, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl %ecx
+; SSSE3-NEXT: shrl $4, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $3, %ecx
+; SSSE3-NEXT: shrl $6, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: shrl $7, %eax
; SSSE3-NEXT: movzwl %ax, %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm0
@@ -2546,69 +2546,69 @@ define <16 x i8> @load_sext_16i1_to_16i8
; SSE2-NEXT: movq %rax, %rsi
; SSE2-NEXT: movq %rax, %rdi
; SSE2-NEXT: movq %rax, %rbp
-; SSE2-NEXT: shlq $49, %rbp
-; SSE2-NEXT: sarq $63, %rbp
+; SSE2-NEXT: shrq $15, %rbp
; SSE2-NEXT: movd %ebp, %xmm0
; SSE2-NEXT: movq %rax, %rbp
; SSE2-NEXT: movsbq %al, %rax
-; SSE2-NEXT: shlq $57, %r8
+; SSE2-NEXT: shlq $49, %r8
; SSE2-NEXT: sarq $63, %r8
; SSE2-NEXT: movd %r8d, %xmm1
-; SSE2-NEXT: shlq $53, %r9
+; SSE2-NEXT: shlq $50, %r9
; SSE2-NEXT: sarq $63, %r9
; SSE2-NEXT: movd %r9d, %xmm2
-; SSE2-NEXT: shlq $61, %r10
+; SSE2-NEXT: shlq $51, %r10
; SSE2-NEXT: sarq $63, %r10
; SSE2-NEXT: movd %r10d, %xmm3
-; SSE2-NEXT: shlq $51, %r11
+; SSE2-NEXT: shlq $52, %r11
; SSE2-NEXT: sarq $63, %r11
; SSE2-NEXT: movd %r11d, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: shlq $59, %r14
+; SSE2-NEXT: shlq $53, %r14
; SSE2-NEXT: sarq $63, %r14
-; SSE2-NEXT: movd %r14d, %xmm5
+; SSE2-NEXT: movd %r14d, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: shlq $55, %r15
+; SSE2-NEXT: shlq $54, %r15
; SSE2-NEXT: sarq $63, %r15
; SSE2-NEXT: movd %r15d, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT: shlq $63, %r12
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT: shlq $55, %r12
; SSE2-NEXT: sarq $63, %r12
-; SSE2-NEXT: movd %r12d, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: shlq $50, %r13
+; SSE2-NEXT: movd %r12d, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: shlq $60, %r13
; SSE2-NEXT: sarq $63, %r13
-; SSE2-NEXT: movd %r13d, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: shlq $58, %rbx
+; SSE2-NEXT: movd %r13d, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: shlq $61, %rbx
; SSE2-NEXT: sarq $63, %rbx
; SSE2-NEXT: movd %ebx, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE2-NEXT: shlq $54, %rcx
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: shlq $62, %rcx
; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movd %ecx, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: shlq $62, %rdx
+; SSE2-NEXT: movd %ecx, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: shlq $63, %rdx
; SSE2-NEXT: sarq $63, %rdx
-; SSE2-NEXT: movd %edx, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: shlq $52, %rsi
+; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: shlq $58, %rsi
; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: movd %esi, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: shlq $60, %rdi
+; SSE2-NEXT: movd %esi, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT: shlq $59, %rdi
; SSE2-NEXT: sarq $63, %rdi
; SSE2-NEXT: movd %edi, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT: shrq $15, %rbp
-; SSE2-NEXT: movd %ebp, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT: shlq $57, %rbp
+; SSE2-NEXT: sarq $63, %rbp
+; SSE2-NEXT: movd %ebp, %xmm2
; SSE2-NEXT: shrq $7, %rax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -2640,69 +2640,69 @@ define <16 x i8> @load_sext_16i1_to_16i8
; SSSE3-NEXT: movq %rax, %rsi
; SSSE3-NEXT: movq %rax, %rdi
; SSSE3-NEXT: movq %rax, %rbp
-; SSSE3-NEXT: shlq $49, %rbp
-; SSSE3-NEXT: sarq $63, %rbp
+; SSSE3-NEXT: shrq $15, %rbp
; SSSE3-NEXT: movd %ebp, %xmm0
; SSSE3-NEXT: movq %rax, %rbp
; SSSE3-NEXT: movsbq %al, %rax
-; SSSE3-NEXT: shlq $57, %r8
+; SSSE3-NEXT: shlq $49, %r8
; SSSE3-NEXT: sarq $63, %r8
; SSSE3-NEXT: movd %r8d, %xmm1
-; SSSE3-NEXT: shlq $53, %r9
+; SSSE3-NEXT: shlq $50, %r9
; SSSE3-NEXT: sarq $63, %r9
; SSSE3-NEXT: movd %r9d, %xmm2
-; SSSE3-NEXT: shlq $61, %r10
+; SSSE3-NEXT: shlq $51, %r10
; SSSE3-NEXT: sarq $63, %r10
; SSSE3-NEXT: movd %r10d, %xmm3
-; SSSE3-NEXT: shlq $51, %r11
+; SSSE3-NEXT: shlq $52, %r11
; SSSE3-NEXT: sarq $63, %r11
; SSSE3-NEXT: movd %r11d, %xmm4
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: shlq $59, %r14
+; SSSE3-NEXT: shlq $53, %r14
; SSSE3-NEXT: sarq $63, %r14
-; SSSE3-NEXT: movd %r14d, %xmm5
+; SSSE3-NEXT: movd %r14d, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: shlq $55, %r15
+; SSSE3-NEXT: shlq $54, %r15
; SSSE3-NEXT: sarq $63, %r15
; SSSE3-NEXT: movd %r15d, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSSE3-NEXT: shlq $63, %r12
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSSE3-NEXT: shlq $55, %r12
; SSSE3-NEXT: sarq $63, %r12
-; SSSE3-NEXT: movd %r12d, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSSE3-NEXT: shlq $50, %r13
+; SSSE3-NEXT: movd %r12d, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT: shlq $60, %r13
; SSSE3-NEXT: sarq $63, %r13
-; SSSE3-NEXT: movd %r13d, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: shlq $58, %rbx
+; SSSE3-NEXT: movd %r13d, %xmm4
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: shlq $61, %rbx
; SSSE3-NEXT: sarq $63, %rbx
; SSSE3-NEXT: movd %ebx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSSE3-NEXT: shlq $54, %rcx
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: shlq $62, %rcx
; SSSE3-NEXT: sarq $63, %rcx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: shlq $62, %rdx
+; SSSE3-NEXT: movd %ecx, %xmm5
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT: shlq $63, %rdx
; SSSE3-NEXT: sarq $63, %rdx
-; SSSE3-NEXT: movd %edx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: shlq $52, %rsi
+; SSSE3-NEXT: movd %edx, %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSSE3-NEXT: shlq $58, %rsi
; SSSE3-NEXT: sarq $63, %rsi
-; SSSE3-NEXT: movd %esi, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSSE3-NEXT: shlq $60, %rdi
+; SSSE3-NEXT: movd %esi, %xmm3
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSSE3-NEXT: shlq $59, %rdi
; SSSE3-NEXT: sarq $63, %rdi
; SSSE3-NEXT: movd %edi, %xmm4
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSSE3-NEXT: shrq $15, %rbp
-; SSSE3-NEXT: movd %ebp, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSSE3-NEXT: shlq $57, %rbp
+; SSSE3-NEXT: sarq $63, %rbp
+; SSSE3-NEXT: movd %ebp, %xmm2
; SSSE3-NEXT: shrq $7, %rax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
@@ -3002,7 +3002,7 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $14, %ecx
+; SSE2-NEXT: shrl $7, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
@@ -3011,21 +3011,21 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $10, %ecx
+; SSE2-NEXT: shrl $5, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $2, %ecx
+; SSE2-NEXT: shrl $4, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $12, %ecx
+; SSE2-NEXT: shrl $3, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $4, %ecx
+; SSE2-NEXT: shrl $2, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
@@ -3033,18 +3033,18 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $8, %ecx
+; SSE2-NEXT: shrl %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $13, %ecx
+; SSE2-NEXT: shrl $11, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $5, %ecx
+; SSE2-NEXT: shrl $10, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -3053,31 +3053,31 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm3
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl %ecx
+; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $11, %ecx
+; SSE2-NEXT: shrl $13, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $3, %ecx
+; SSE2-NEXT: shrl $12, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $7, %ecx
+; SSE2-NEXT: shrl $14, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: shrl $15, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psllw $15, %xmm0
@@ -3091,7 +3091,7 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movzwl (%rdi), %eax
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $14, %ecx
+; SSSE3-NEXT: shrl $7, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
@@ -3100,21 +3100,21 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $10, %ecx
+; SSSE3-NEXT: shrl $5, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $2, %ecx
+; SSSE3-NEXT: shrl $4, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $12, %ecx
+; SSSE3-NEXT: shrl $3, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $4, %ecx
+; SSSE3-NEXT: shrl $2, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm3
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
@@ -3122,18 +3122,18 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $8, %ecx
+; SSSE3-NEXT: shrl %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $13, %ecx
+; SSSE3-NEXT: shrl $11, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $5, %ecx
+; SSSE3-NEXT: shrl $10, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -3142,31 +3142,31 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm3
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl %ecx
+; SSSE3-NEXT: shrl $8, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $11, %ecx
+; SSSE3-NEXT: shrl $13, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $3, %ecx
+; SSSE3-NEXT: shrl $12, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm3
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $7, %ecx
+; SSSE3-NEXT: shrl $14, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: shrl $15, %eax
; SSSE3-NEXT: movzwl %ax, %eax
; SSSE3-NEXT: movd %eax, %xmm4
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psllw $15, %xmm0
@@ -3556,162 +3556,162 @@ define <32 x i8> @load_sext_32i1_to_32i8
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movswq (%rdi), %rbx
-; SSE2-NEXT: movq %rbx, %r10
-; SSE2-NEXT: movq %rbx, %r8
-; SSE2-NEXT: movq %rbx, %r9
-; SSE2-NEXT: movq %rbx, %r11
-; SSE2-NEXT: movq %rbx, %r14
-; SSE2-NEXT: movq %rbx, %r15
-; SSE2-NEXT: movq %rbx, %r12
-; SSE2-NEXT: movq %rbx, %r13
-; SSE2-NEXT: movq %rbx, %rdx
-; SSE2-NEXT: movq %rbx, %rsi
-; SSE2-NEXT: movq %rbx, %rcx
-; SSE2-NEXT: movq %rbx, %rbp
-; SSE2-NEXT: movq %rbx, %rax
-; SSE2-NEXT: shlq $49, %rax
-; SSE2-NEXT: sarq $63, %rax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movq %rbx, %rax
-; SSE2-NEXT: shlq $57, %r10
+; SSE2-NEXT: movswq (%rdi), %rax
+; SSE2-NEXT: movq %rax, %r10
+; SSE2-NEXT: movq %rax, %r8
+; SSE2-NEXT: movq %rax, %r9
+; SSE2-NEXT: movq %rax, %r11
+; SSE2-NEXT: movq %rax, %r14
+; SSE2-NEXT: movq %rax, %r15
+; SSE2-NEXT: movq %rax, %r12
+; SSE2-NEXT: movq %rax, %r13
+; SSE2-NEXT: movq %rax, %rdx
+; SSE2-NEXT: movq %rax, %rsi
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: movq %rax, %rbp
+; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: shrq $15, %rbx
+; SSE2-NEXT: movd %ebx, %xmm0
+; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: shlq $49, %r10
; SSE2-NEXT: sarq $63, %r10
; SSE2-NEXT: movd %r10d, %xmm15
-; SSE2-NEXT: movq %rbx, %r10
-; SSE2-NEXT: movsbq %bl, %rbx
+; SSE2-NEXT: movq %rax, %r10
+; SSE2-NEXT: movsbq %al, %rax
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT: shlq $53, %r8
+; SSE2-NEXT: shlq $50, %r8
; SSE2-NEXT: sarq $63, %r8
; SSE2-NEXT: movd %r8d, %xmm8
-; SSE2-NEXT: shlq $61, %r9
+; SSE2-NEXT: shlq $51, %r9
; SSE2-NEXT: sarq $63, %r9
-; SSE2-NEXT: movd %r9d, %xmm2
-; SSE2-NEXT: shlq $51, %r11
+; SSE2-NEXT: movd %r9d, %xmm3
+; SSE2-NEXT: shlq $52, %r11
; SSE2-NEXT: sarq $63, %r11
; SSE2-NEXT: movd %r11d, %xmm9
-; SSE2-NEXT: shlq $59, %r14
+; SSE2-NEXT: shlq $53, %r14
; SSE2-NEXT: sarq $63, %r14
-; SSE2-NEXT: movd %r14d, %xmm5
-; SSE2-NEXT: shlq $55, %r15
+; SSE2-NEXT: movd %r14d, %xmm6
+; SSE2-NEXT: shlq $54, %r15
; SSE2-NEXT: sarq $63, %r15
; SSE2-NEXT: movd %r15d, %xmm10
-; SSE2-NEXT: shlq $63, %r12
+; SSE2-NEXT: shlq $55, %r12
; SSE2-NEXT: sarq $63, %r12
-; SSE2-NEXT: movd %r12d, %xmm0
-; SSE2-NEXT: shlq $50, %r13
+; SSE2-NEXT: movd %r12d, %xmm2
+; SSE2-NEXT: shlq $60, %r13
; SSE2-NEXT: sarq $63, %r13
; SSE2-NEXT: movd %r13d, %xmm11
-; SSE2-NEXT: shlq $58, %rdx
+; SSE2-NEXT: shlq $61, %rdx
; SSE2-NEXT: sarq $63, %rdx
-; SSE2-NEXT: movd %edx, %xmm4
-; SSE2-NEXT: shlq $54, %rsi
+; SSE2-NEXT: movd %edx, %xmm5
+; SSE2-NEXT: shlq $62, %rsi
; SSE2-NEXT: sarq $63, %rsi
; SSE2-NEXT: movd %esi, %xmm12
-; SSE2-NEXT: shlq $62, %rcx
+; SSE2-NEXT: shlq $63, %rcx
; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movd %ecx, %xmm6
-; SSE2-NEXT: shlq $52, %rbp
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: shlq $58, %rbp
; SSE2-NEXT: sarq $63, %rbp
; SSE2-NEXT: movd %ebp, %xmm13
-; SSE2-NEXT: shlq $60, %rax
-; SSE2-NEXT: sarq $63, %rax
-; SSE2-NEXT: movd %eax, %xmm7
-; SSE2-NEXT: shrq $15, %r10
-; SSE2-NEXT: movd %r10d, %xmm14
-; SSE2-NEXT: shrq $7, %rbx
-; SSE2-NEXT: movd %ebx, %xmm3
-; SSE2-NEXT: movswq 2(%rdi), %rdx
-; SSE2-NEXT: movq %rdx, %r8
-; SSE2-NEXT: movq %rdx, %r9
-; SSE2-NEXT: movq %rdx, %r10
-; SSE2-NEXT: movq %rdx, %r11
-; SSE2-NEXT: movq %rdx, %r14
-; SSE2-NEXT: movq %rdx, %r15
-; SSE2-NEXT: movq %rdx, %r12
-; SSE2-NEXT: movq %rdx, %r13
-; SSE2-NEXT: movq %rdx, %rbx
-; SSE2-NEXT: movq %rdx, %rax
-; SSE2-NEXT: movq %rdx, %rcx
-; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: movq %rdx, %rdi
-; SSE2-NEXT: movq %rdx, %rbp
-; SSE2-NEXT: shlq $49, %rbp
-; SSE2-NEXT: sarq $63, %rbp
+; SSE2-NEXT: shlq $59, %rbx
+; SSE2-NEXT: sarq $63, %rbx
+; SSE2-NEXT: movd %ebx, %xmm7
+; SSE2-NEXT: shlq $57, %r10
+; SSE2-NEXT: sarq $63, %r10
+; SSE2-NEXT: movd %r10d, %xmm4
+; SSE2-NEXT: shrq $7, %rax
+; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: movswq 2(%rdi), %rsi
+; SSE2-NEXT: movq %rsi, %r8
+; SSE2-NEXT: movq %rsi, %r9
+; SSE2-NEXT: movq %rsi, %r10
+; SSE2-NEXT: movq %rsi, %r11
+; SSE2-NEXT: movq %rsi, %r14
+; SSE2-NEXT: movq %rsi, %r15
+; SSE2-NEXT: movq %rsi, %r12
+; SSE2-NEXT: movq %rsi, %r13
+; SSE2-NEXT: movq %rsi, %rbx
+; SSE2-NEXT: movq %rsi, %rax
+; SSE2-NEXT: movq %rsi, %rcx
+; SSE2-NEXT: movq %rsi, %rdx
+; SSE2-NEXT: movq %rsi, %rdi
+; SSE2-NEXT: movq %rsi, %rbp
+; SSE2-NEXT: shrq $15, %rbp
; SSE2-NEXT: movd %ebp, %xmm1
-; SSE2-NEXT: movq %rdx, %rbp
-; SSE2-NEXT: movsbq %dl, %rdx
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: movq %rsi, %rbp
+; SSE2-NEXT: movsbq %sil, %rsi
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-NEXT: shlq $57, %r8
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-NEXT: shlq $49, %r8
; SSE2-NEXT: sarq $63, %r8
-; SSE2-NEXT: movd %r8d, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; SSE2-NEXT: shlq $53, %r9
+; SSE2-NEXT: movd %r8d, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT: shlq $50, %r9
; SSE2-NEXT: sarq $63, %r9
-; SSE2-NEXT: movd %r9d, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; SSE2-NEXT: shlq $61, %r10
+; SSE2-NEXT: movd %r9d, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE2-NEXT: shlq $51, %r10
; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: movd %r10d, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; SSE2-NEXT: shlq $51, %r11
+; SSE2-NEXT: movd %r10d, %xmm5
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: shlq $52, %r11
; SSE2-NEXT: sarq $63, %r11
-; SSE2-NEXT: movd %r11d, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: shlq $59, %r14
+; SSE2-NEXT: movd %r11d, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: shlq $53, %r14
; SSE2-NEXT: sarq $63, %r14
-; SSE2-NEXT: movd %r14d, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-NEXT: shlq $55, %r15
+; SSE2-NEXT: movd %r14d, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: shlq $54, %r15
; SSE2-NEXT: sarq $63, %r15
-; SSE2-NEXT: movd %r15d, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE2-NEXT: shlq $63, %r12
+; SSE2-NEXT: movd %r15d, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE2-NEXT: shlq $55, %r12
; SSE2-NEXT: sarq $63, %r12
-; SSE2-NEXT: movd %r12d, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE2-NEXT: shlq $50, %r13
+; SSE2-NEXT: movd %r12d, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: shlq $60, %r13
; SSE2-NEXT: sarq $63, %r13
; SSE2-NEXT: movd %r13d, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: shlq $58, %rbx
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: shlq $61, %rbx
; SSE2-NEXT: sarq $63, %rbx
-; SSE2-NEXT: movd %ebx, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE2-NEXT: shlq $54, %rax
+; SSE2-NEXT: movd %ebx, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT: shlq $62, %rax
; SSE2-NEXT: sarq $63, %rax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: shlq $62, %rcx
+; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-NEXT: shlq $63, %rcx
; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movd %ecx, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: shlq $52, %rsi
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: movd %esi, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-NEXT: shlq $60, %rdi
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: shlq $58, %rdx
+; SSE2-NEXT: sarq $63, %rdx
+; SSE2-NEXT: movd %edx, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: shlq $59, %rdi
; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: movd %edi, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: shrq $15, %rbp
+; SSE2-NEXT: movd %edi, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: shlq $57, %rbp
+; SSE2-NEXT: sarq $63, %rbp
; SSE2-NEXT: movd %ebp, %xmm2
-; SSE2-NEXT: shrq $7, %rdx
-; SSE2-NEXT: movd %edx, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: shrq $7, %rsi
+; SSE2-NEXT: movd %esi, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -3728,162 +3728,162 @@ define <32 x i8> @load_sext_32i1_to_32i8
; SSSE3-NEXT: pushq %r13
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movswq (%rdi), %rbx
-; SSSE3-NEXT: movq %rbx, %r10
-; SSSE3-NEXT: movq %rbx, %r8
-; SSSE3-NEXT: movq %rbx, %r9
-; SSSE3-NEXT: movq %rbx, %r11
-; SSSE3-NEXT: movq %rbx, %r14
-; SSSE3-NEXT: movq %rbx, %r15
-; SSSE3-NEXT: movq %rbx, %r12
-; SSSE3-NEXT: movq %rbx, %r13
-; SSSE3-NEXT: movq %rbx, %rdx
-; SSSE3-NEXT: movq %rbx, %rsi
-; SSSE3-NEXT: movq %rbx, %rcx
-; SSSE3-NEXT: movq %rbx, %rbp
-; SSSE3-NEXT: movq %rbx, %rax
-; SSSE3-NEXT: shlq $49, %rax
-; SSSE3-NEXT: sarq $63, %rax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movq %rbx, %rax
-; SSSE3-NEXT: shlq $57, %r10
+; SSSE3-NEXT: movswq (%rdi), %rax
+; SSSE3-NEXT: movq %rax, %r10
+; SSSE3-NEXT: movq %rax, %r8
+; SSSE3-NEXT: movq %rax, %r9
+; SSSE3-NEXT: movq %rax, %r11
+; SSSE3-NEXT: movq %rax, %r14
+; SSSE3-NEXT: movq %rax, %r15
+; SSSE3-NEXT: movq %rax, %r12
+; SSSE3-NEXT: movq %rax, %r13
+; SSSE3-NEXT: movq %rax, %rdx
+; SSSE3-NEXT: movq %rax, %rsi
+; SSSE3-NEXT: movq %rax, %rcx
+; SSSE3-NEXT: movq %rax, %rbp
+; SSSE3-NEXT: movq %rax, %rbx
+; SSSE3-NEXT: shrq $15, %rbx
+; SSSE3-NEXT: movd %ebx, %xmm0
+; SSSE3-NEXT: movq %rax, %rbx
+; SSSE3-NEXT: shlq $49, %r10
; SSSE3-NEXT: sarq $63, %r10
; SSSE3-NEXT: movd %r10d, %xmm15
-; SSSE3-NEXT: movq %rbx, %r10
-; SSSE3-NEXT: movsbq %bl, %rbx
+; SSSE3-NEXT: movq %rax, %r10
+; SSSE3-NEXT: movsbq %al, %rax
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSSE3-NEXT: shlq $53, %r8
+; SSSE3-NEXT: shlq $50, %r8
; SSSE3-NEXT: sarq $63, %r8
; SSSE3-NEXT: movd %r8d, %xmm8
-; SSSE3-NEXT: shlq $61, %r9
+; SSSE3-NEXT: shlq $51, %r9
; SSSE3-NEXT: sarq $63, %r9
-; SSSE3-NEXT: movd %r9d, %xmm2
-; SSSE3-NEXT: shlq $51, %r11
+; SSSE3-NEXT: movd %r9d, %xmm3
+; SSSE3-NEXT: shlq $52, %r11
; SSSE3-NEXT: sarq $63, %r11
; SSSE3-NEXT: movd %r11d, %xmm9
-; SSSE3-NEXT: shlq $59, %r14
+; SSSE3-NEXT: shlq $53, %r14
; SSSE3-NEXT: sarq $63, %r14
-; SSSE3-NEXT: movd %r14d, %xmm5
-; SSSE3-NEXT: shlq $55, %r15
+; SSSE3-NEXT: movd %r14d, %xmm6
+; SSSE3-NEXT: shlq $54, %r15
; SSSE3-NEXT: sarq $63, %r15
; SSSE3-NEXT: movd %r15d, %xmm10
-; SSSE3-NEXT: shlq $63, %r12
+; SSSE3-NEXT: shlq $55, %r12
; SSSE3-NEXT: sarq $63, %r12
-; SSSE3-NEXT: movd %r12d, %xmm0
-; SSSE3-NEXT: shlq $50, %r13
+; SSSE3-NEXT: movd %r12d, %xmm2
+; SSSE3-NEXT: shlq $60, %r13
; SSSE3-NEXT: sarq $63, %r13
; SSSE3-NEXT: movd %r13d, %xmm11
-; SSSE3-NEXT: shlq $58, %rdx
+; SSSE3-NEXT: shlq $61, %rdx
; SSSE3-NEXT: sarq $63, %rdx
-; SSSE3-NEXT: movd %edx, %xmm4
-; SSSE3-NEXT: shlq $54, %rsi
+; SSSE3-NEXT: movd %edx, %xmm5
+; SSSE3-NEXT: shlq $62, %rsi
; SSSE3-NEXT: sarq $63, %rsi
; SSSE3-NEXT: movd %esi, %xmm12
-; SSSE3-NEXT: shlq $62, %rcx
+; SSSE3-NEXT: shlq $63, %rcx
; SSSE3-NEXT: sarq $63, %rcx
-; SSSE3-NEXT: movd %ecx, %xmm6
-; SSSE3-NEXT: shlq $52, %rbp
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: shlq $58, %rbp
; SSSE3-NEXT: sarq $63, %rbp
; SSSE3-NEXT: movd %ebp, %xmm13
-; SSSE3-NEXT: shlq $60, %rax
-; SSSE3-NEXT: sarq $63, %rax
-; SSSE3-NEXT: movd %eax, %xmm7
-; SSSE3-NEXT: shrq $15, %r10
-; SSSE3-NEXT: movd %r10d, %xmm14
-; SSSE3-NEXT: shrq $7, %rbx
-; SSSE3-NEXT: movd %ebx, %xmm3
-; SSSE3-NEXT: movswq 2(%rdi), %rdx
-; SSSE3-NEXT: movq %rdx, %r8
-; SSSE3-NEXT: movq %rdx, %r9
-; SSSE3-NEXT: movq %rdx, %r10
-; SSSE3-NEXT: movq %rdx, %r11
-; SSSE3-NEXT: movq %rdx, %r14
-; SSSE3-NEXT: movq %rdx, %r15
-; SSSE3-NEXT: movq %rdx, %r12
-; SSSE3-NEXT: movq %rdx, %r13
-; SSSE3-NEXT: movq %rdx, %rbx
-; SSSE3-NEXT: movq %rdx, %rax
-; SSSE3-NEXT: movq %rdx, %rcx
-; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: movq %rdx, %rdi
-; SSSE3-NEXT: movq %rdx, %rbp
-; SSSE3-NEXT: shlq $49, %rbp
-; SSSE3-NEXT: sarq $63, %rbp
+; SSSE3-NEXT: shlq $59, %rbx
+; SSSE3-NEXT: sarq $63, %rbx
+; SSSE3-NEXT: movd %ebx, %xmm7
+; SSSE3-NEXT: shlq $57, %r10
+; SSSE3-NEXT: sarq $63, %r10
+; SSSE3-NEXT: movd %r10d, %xmm4
+; SSSE3-NEXT: shrq $7, %rax
+; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: movswq 2(%rdi), %rsi
+; SSSE3-NEXT: movq %rsi, %r8
+; SSSE3-NEXT: movq %rsi, %r9
+; SSSE3-NEXT: movq %rsi, %r10
+; SSSE3-NEXT: movq %rsi, %r11
+; SSSE3-NEXT: movq %rsi, %r14
+; SSSE3-NEXT: movq %rsi, %r15
+; SSSE3-NEXT: movq %rsi, %r12
+; SSSE3-NEXT: movq %rsi, %r13
+; SSSE3-NEXT: movq %rsi, %rbx
+; SSSE3-NEXT: movq %rsi, %rax
+; SSSE3-NEXT: movq %rsi, %rcx
+; SSSE3-NEXT: movq %rsi, %rdx
+; SSSE3-NEXT: movq %rsi, %rdi
+; SSSE3-NEXT: movq %rsi, %rbp
+; SSSE3-NEXT: shrq $15, %rbp
; SSSE3-NEXT: movd %ebp, %xmm1
-; SSSE3-NEXT: movq %rdx, %rbp
-; SSSE3-NEXT: movsbq %dl, %rdx
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSSE3-NEXT: movq %rsi, %rbp
+; SSSE3-NEXT: movsbq %sil, %rsi
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSSE3-NEXT: shlq $57, %r8
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSSE3-NEXT: shlq $49, %r8
; SSSE3-NEXT: sarq $63, %r8
-; SSSE3-NEXT: movd %r8d, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; SSSE3-NEXT: shlq $53, %r9
+; SSSE3-NEXT: movd %r8d, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSSE3-NEXT: shlq $50, %r9
; SSSE3-NEXT: sarq $63, %r9
-; SSSE3-NEXT: movd %r9d, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; SSSE3-NEXT: shlq $61, %r10
+; SSSE3-NEXT: movd %r9d, %xmm4
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSSE3-NEXT: shlq $51, %r10
; SSSE3-NEXT: sarq $63, %r10
-; SSSE3-NEXT: movd %r10d, %xmm4
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; SSSE3-NEXT: shlq $51, %r11
+; SSSE3-NEXT: movd %r10d, %xmm5
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: shlq $52, %r11
; SSSE3-NEXT: sarq $63, %r11
-; SSSE3-NEXT: movd %r11d, %xmm5
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: shlq $59, %r14
+; SSSE3-NEXT: movd %r11d, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSSE3-NEXT: shlq $53, %r14
; SSSE3-NEXT: sarq $63, %r14
-; SSSE3-NEXT: movd %r14d, %xmm6
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSSE3-NEXT: shlq $55, %r15
+; SSSE3-NEXT: movd %r14d, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSSE3-NEXT: shlq $54, %r15
; SSSE3-NEXT: sarq $63, %r15
-; SSSE3-NEXT: movd %r15d, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSSE3-NEXT: shlq $63, %r12
+; SSSE3-NEXT: movd %r15d, %xmm4
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSSE3-NEXT: shlq $55, %r12
; SSSE3-NEXT: sarq $63, %r12
-; SSSE3-NEXT: movd %r12d, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSSE3-NEXT: shlq $50, %r13
+; SSSE3-NEXT: movd %r12d, %xmm3
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: shlq $60, %r13
; SSSE3-NEXT: sarq $63, %r13
; SSSE3-NEXT: movd %r13d, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: shlq $58, %rbx
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT: shlq $61, %rbx
; SSSE3-NEXT: sarq $63, %rbx
-; SSSE3-NEXT: movd %ebx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSSE3-NEXT: shlq $54, %rax
+; SSSE3-NEXT: movd %ebx, %xmm4
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSSE3-NEXT: shlq $62, %rax
; SSSE3-NEXT: sarq $63, %rax
-; SSSE3-NEXT: movd %eax, %xmm5
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT: shlq $62, %rcx
+; SSSE3-NEXT: movd %eax, %xmm6
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSSE3-NEXT: shlq $63, %rcx
; SSSE3-NEXT: sarq $63, %rcx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: shlq $52, %rsi
-; SSSE3-NEXT: sarq $63, %rsi
-; SSSE3-NEXT: movd %esi, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSSE3-NEXT: shlq $60, %rdi
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSSE3-NEXT: shlq $58, %rdx
+; SSSE3-NEXT: sarq $63, %rdx
+; SSSE3-NEXT: movd %edx, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSSE3-NEXT: shlq $59, %rdi
; SSSE3-NEXT: sarq $63, %rdi
-; SSSE3-NEXT: movd %edi, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: shrq $15, %rbp
+; SSSE3-NEXT: movd %edi, %xmm4
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSSE3-NEXT: shlq $57, %rbp
+; SSSE3-NEXT: sarq $63, %rbp
; SSSE3-NEXT: movd %ebp, %xmm2
-; SSSE3-NEXT: shrq $7, %rdx
-; SSSE3-NEXT: movd %edx, %xmm5
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSSE3-NEXT: shrq $7, %rsi
+; SSSE3-NEXT: movd %esi, %xmm5
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll Sun Jun 4 15:12:04 2017
@@ -83,7 +83,7 @@ define <4 x float> @var_shuffle_v4f32_v4
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
@@ -103,7 +103,7 @@ define <4 x float> @var_shuffle_v4f32_v4
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
@@ -168,7 +168,7 @@ define <4 x i32> @var_shuffle_v4i32_v4i3
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
@@ -188,7 +188,7 @@ define <4 x i32> @var_shuffle_v4i32_v4i3
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
@@ -257,27 +257,27 @@ define <8 x i16> @var_shuffle_v8i16_v8i1
; SSE2-NEXT: andl $7, %eax
; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %eax
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
@@ -301,27 +301,27 @@ define <8 x i16> @var_shuffle_v8i16_v8i1
; SSSE3-NEXT: andl $7, %eax
; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
@@ -425,67 +425,67 @@ define <16 x i8> @var_shuffle_v16i8_v16i
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r10), %eax
; SSE2-NEXT: movd %eax, %xmm9
-; SSE2-NEXT: andl $15, %ecx
-; SSE2-NEXT: movzbl (%rcx,%r10), %eax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%r10), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r10), %eax
; SSE2-NEXT: movd %eax, %xmm10
-; SSE2-NEXT: andl $15, %r9d
-; SSE2-NEXT: movzbl (%r9,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm7
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm11
-; SSE2-NEXT: andl $15, %esi
-; SSE2-NEXT: movzbl (%rsi,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: movd %eax, %xmm7
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: movd %eax, %xmm11
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: movd %eax, %xmm6
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm13
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movd %eax, %xmm12
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: andl $15, %r9d
+; SSE2-NEXT: movzbl (%r9,%r10), %eax
+; SSE2-NEXT: movd %eax, %xmm13
; SSE2-NEXT: andl $15, %r8d
; SSE2-NEXT: movzbl (%r8,%r10), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl (%rcx,%r10), %eax
+; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: andl $15, %edx
+; SSE2-NEXT: movzbl (%rdx,%r10), %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: andl $15, %esi
+; SSE2-NEXT: movzbl (%rsi,%r10), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: andl $15, %edi
; SSE2-NEXT: movzbl (%rdi,%r10), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -510,67 +510,67 @@ define <16 x i8> @var_shuffle_v16i8_v16i
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r10), %eax
; SSSE3-NEXT: movd %eax, %xmm9
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%r10), %eax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%r10), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r10), %eax
; SSSE3-NEXT: movd %eax, %xmm10
-; SSSE3-NEXT: andl $15, %r9d
-; SSSE3-NEXT: movzbl (%r9,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm7
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm11
-; SSSE3-NEXT: andl $15, %esi
-; SSSE3-NEXT: movzbl (%rsi,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm6
+; SSSE3-NEXT: movd %eax, %xmm7
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm12
+; SSSE3-NEXT: movd %eax, %xmm11
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm5
+; SSSE3-NEXT: movd %eax, %xmm6
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm13
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: movd %eax, %xmm12
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: movd %eax, %xmm5
+; SSSE3-NEXT: andl $15, %r9d
+; SSSE3-NEXT: movzbl (%r9,%r10), %eax
+; SSSE3-NEXT: movd %eax, %xmm13
; SSSE3-NEXT: andl $15, %r8d
; SSSE3-NEXT: movzbl (%r8,%r10), %eax
+; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl (%rcx,%r10), %eax
+; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: andl $15, %edx
+; SSSE3-NEXT: movzbl (%rdx,%r10), %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: andl $15, %esi
+; SSSE3-NEXT: movzbl (%rsi,%r10), %eax
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: andl $15, %edi
; SSSE3-NEXT: movzbl (%rdi,%r10), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -739,7 +739,7 @@ define <4 x i32> @mem_shuffle_v4i32_v4i3
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
@@ -759,7 +759,7 @@ define <4 x i32> @mem_shuffle_v4i32_v4i3
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
@@ -824,23 +824,23 @@ define <16 x i8> @mem_shuffle_v16i8_v16i
; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm8
-; SSE2-NEXT: movzbl 7(%rdi), %edx
+; SSE2-NEXT: movzbl 14(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm15
-; SSE2-NEXT: movzbl 11(%rdi), %edx
+; SSE2-NEXT: movzbl 13(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm9
-; SSE2-NEXT: movzbl 3(%rdi), %edx
+; SSE2-NEXT: movzbl 12(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm3
-; SSE2-NEXT: movzbl 13(%rdi), %edx
+; SSE2-NEXT: movzbl 11(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm10
-; SSE2-NEXT: movzbl 5(%rdi), %edx
+; SSE2-NEXT: movzbl 10(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm7
@@ -848,11 +848,11 @@ define <16 x i8> @mem_shuffle_v16i8_v16i
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm11
-; SSE2-NEXT: movzbl 1(%rdi), %edx
+; SSE2-NEXT: movzbl 8(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm6
-; SSE2-NEXT: movzbl 14(%rdi), %edx
+; SSE2-NEXT: movzbl 7(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm12
@@ -860,23 +860,23 @@ define <16 x i8> @mem_shuffle_v16i8_v16i
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm5
-; SSE2-NEXT: movzbl 10(%rdi), %edx
+; SSE2-NEXT: movzbl 5(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm13
-; SSE2-NEXT: movzbl 2(%rdi), %edx
+; SSE2-NEXT: movzbl 4(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm4
-; SSE2-NEXT: movzbl 12(%rdi), %edx
+; SSE2-NEXT: movzbl 3(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm14
-; SSE2-NEXT: movzbl 4(%rdi), %edx
+; SSE2-NEXT: movzbl 2(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm1
-; SSE2-NEXT: movzbl 8(%rdi), %edx
+; SSE2-NEXT: movzbl 1(%rdi), %edx
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
; SSE2-NEXT: movd %edx, %xmm2
@@ -885,19 +885,19 @@ define <16 x i8> @mem_shuffle_v16i8_v16i
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -909,23 +909,23 @@ define <16 x i8> @mem_shuffle_v16i8_v16i
; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm8
-; SSSE3-NEXT: movzbl 7(%rdi), %edx
+; SSSE3-NEXT: movzbl 14(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm15
-; SSSE3-NEXT: movzbl 11(%rdi), %edx
+; SSSE3-NEXT: movzbl 13(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm9
-; SSSE3-NEXT: movzbl 3(%rdi), %edx
+; SSSE3-NEXT: movzbl 12(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm3
-; SSSE3-NEXT: movzbl 13(%rdi), %edx
+; SSSE3-NEXT: movzbl 11(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm10
-; SSSE3-NEXT: movzbl 5(%rdi), %edx
+; SSSE3-NEXT: movzbl 10(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm7
@@ -933,11 +933,11 @@ define <16 x i8> @mem_shuffle_v16i8_v16i
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm11
-; SSSE3-NEXT: movzbl 1(%rdi), %edx
+; SSSE3-NEXT: movzbl 8(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm6
-; SSSE3-NEXT: movzbl 14(%rdi), %edx
+; SSSE3-NEXT: movzbl 7(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm12
@@ -945,23 +945,23 @@ define <16 x i8> @mem_shuffle_v16i8_v16i
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm5
-; SSSE3-NEXT: movzbl 10(%rdi), %edx
+; SSSE3-NEXT: movzbl 5(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm13
-; SSSE3-NEXT: movzbl 2(%rdi), %edx
+; SSSE3-NEXT: movzbl 4(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm4
-; SSSE3-NEXT: movzbl 12(%rdi), %edx
+; SSSE3-NEXT: movzbl 3(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm14
-; SSSE3-NEXT: movzbl 4(%rdi), %edx
+; SSSE3-NEXT: movzbl 2(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: movzbl 8(%rdi), %edx
+; SSSE3-NEXT: movzbl 1(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm2
@@ -970,19 +970,19 @@ define <16 x i8> @mem_shuffle_v16i8_v16i
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -1225,28 +1225,27 @@ define <8 x i16> @var_shuffle_v8i16_v8i1
; SSE2-NEXT: andl $7, %ecx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: andl $7, %r8d
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: andl $7, %r9d
; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: movzwl -40(%rsp,%rdx,2), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: movzwl -40(%rsp,%r8,2), %eax
; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movzwl -40(%rsp,%r8,2), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
@@ -1263,28 +1262,27 @@ define <8 x i16> @var_shuffle_v8i16_v8i1
; SSSE3-NEXT: andl $7, %ecx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: andl $7, %r8d
-; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: andl $7, %r9d
; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSSE3-NEXT: movzwl -40(%rsp,%rdx,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: movzwl -40(%rsp,%r8,2), %eax
; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: movzwl -40(%rsp,%r8,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
Modified: llvm/trunk/test/CodeGen/X86/vshift-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vshift-1.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vshift-1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vshift-1.ll Sun Jun 4 15:12:04 2017
@@ -28,12 +28,9 @@ define void @shift1b(<2 x i64> %val, <2
; X32-LABEL: shift1b:
; X32: # BB#0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-NEXT: psllq %xmm2, %xmm0
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-NEXT: psllq %xmm1, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
Modified: llvm/trunk/test/CodeGen/X86/vshift-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vshift-2.ll?rev=304688&r1=304687&r2=304688&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vshift-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vshift-2.ll Sun Jun 4 15:12:04 2017
@@ -28,12 +28,9 @@ define void @shift1b(<2 x i64> %val, <2
; X32-LABEL: shift1b:
; X32: # BB#0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-NEXT: psrlq %xmm2, %xmm0
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-NEXT: psrlq %xmm1, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
More information about the llvm-commits
mailing list