[llvm] 6756947 - [X86] lowerV8I16Shuffle - use PACKSS(SEXT_INREG(X),SEXT_INREG(Y)) for pre-SSSE3 truncation shuffles
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 26 08:57:22 PDT 2023
Author: Simon Pilgrim
Date: 2023-06-26T16:50:13+01:00
New Revision: 6756947ac6ef35f774817995c3e052ad48c83144
URL: https://github.com/llvm/llvm-project/commit/6756947ac6ef35f774817995c3e052ad48c83144
DIFF: https://github.com/llvm/llvm-project/commit/6756947ac6ef35f774817995c3e052ad48c83144.diff
LOG: [X86] lowerV8I16Shuffle - use PACKSS(SEXT_INREG(X),SEXT_INREG(Y)) for pre-SSSE3 truncation shuffles
The comment about PSHUFLW+PSHUFHW+PSHUFD was outdated, as it referred to the single-input case, which is now always handled earlier.
Another step towards removing premature combines of vector truncations to PACK.
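For reference, a minimal SSE2 intrinsics sketch of the new pre-SSSE3 path (the helper name is illustrative, not part of the patch): each input is sign-extended in-register from the low 16 bits of every 32-bit lane via PSLLD+PSRAD, so PACKSSDW never saturates and acts as a plain truncation, yielding the even 16-bit elements of both inputs.

#include <emmintrin.h> // SSE2

// PACKSS(SEXT_INREG(A), SEXT_INREG(B)): keep the even 16-bit elements of A then B.
// Hypothetical helper for illustration only.
static __m128i trunc_even_i16(__m128i A, __m128i B) {
  __m128i LoA = _mm_srai_epi32(_mm_slli_epi32(A, 16), 16); // sign-extend low 16 bits of each i32 lane of A
  __m128i LoB = _mm_srai_epi32(_mm_slli_epi32(B, 16), 16); // sign-extend low 16 bits of each i32 lane of B
  return _mm_packs_epi32(LoA, LoB);                        // PACKSSDW: inputs fit in i16, so no saturation
}

This corresponds to the pslld/psrad/packssdw sequences that now appear in the SSE2/SSE3 check lines of the updated tests below.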
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/haddsub-shuf.ll
llvm/test/CodeGen/X86/vec_fp_to_int.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7adcf4704871d..888a2773357ba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16509,12 +16509,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
- // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
- // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
- if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
+ if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
!Subtarget.hasVLX()) {
// Check if this is part of a 256-bit vector truncation.
+ unsigned PackOpc = 0;
if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
@@ -16525,7 +16524,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
V1 = extract128BitVector(V1V2, 0, DAG, DL);
V2 = extract128BitVector(V1V2, 4, DAG, DL);
- } else {
+ PackOpc = X86ISD::PACKUS;
+ } else if (Subtarget.hasSSE41()) {
SmallVector<SDValue, 4> DWordClearOps(4,
DAG.getConstant(0, DL, MVT::i32));
for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
@@ -16536,14 +16536,26 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DWordClearMask);
V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
DWordClearMask);
+ PackOpc = X86ISD::PACKUS;
+ } else if (!Subtarget.hasSSSE3()) {
+ SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
+ V1 = DAG.getBitcast(MVT::v4i32, V1);
+ V2 = DAG.getBitcast(MVT::v4i32, V2);
+ V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
+ V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
+ V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
+ V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
+ PackOpc = X86ISD::PACKSS;
+ }
+ if (PackOpc) {
+ // Now pack things back together.
+ SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
+ if (NumEvenDrops == 2) {
+ Result = DAG.getBitcast(MVT::v4i32, Result);
+ Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
+ }
+ return Result;
}
- // Now pack things back together.
- SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
- if (NumEvenDrops == 2) {
- Result = DAG.getBitcast(MVT::v4i32, Result);
- Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
- }
- return Result;
}
// When compacting odd (upper) elements, use PACKSS pre-SSE41.
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index d8abe595de8ba..364ad953a11d4 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -715,13 +715,13 @@ define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
; SSE3-LABEL: hadd_v16i16a:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE3-NEXT: movdqa %xmm1, %xmm3
+; SSE3-NEXT: pslld $16, %xmm3
+; SSE3-NEXT: psrad $16, %xmm3
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: pslld $16, %xmm2
+; SSE3-NEXT: psrad $16, %xmm2
+; SSE3-NEXT: packssdw %xmm3, %xmm2
; SSE3-NEXT: psrad $16, %xmm1
; SSE3-NEXT: psrad $16, %xmm0
; SSE3-NEXT: packssdw %xmm1, %xmm0
@@ -855,13 +855,13 @@ define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
; SSE3-LABEL: hsub_v16i16a:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE3-NEXT: movdqa %xmm1, %xmm3
+; SSE3-NEXT: pslld $16, %xmm3
+; SSE3-NEXT: psrad $16, %xmm3
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: pslld $16, %xmm2
+; SSE3-NEXT: psrad $16, %xmm2
+; SSE3-NEXT: packssdw %xmm3, %xmm2
; SSE3-NEXT: psrad $16, %xmm1
; SSE3-NEXT: psrad $16, %xmm0
; SSE3-NEXT: packssdw %xmm1, %xmm0
@@ -1367,20 +1367,20 @@ define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
; SSE3-LABEL: hadd_16i16_16i16_shuffle:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE3-NEXT: movdqa %xmm3, %xmm5
+; SSE3-NEXT: pslld $16, %xmm5
+; SSE3-NEXT: psrad $16, %xmm5
+; SSE3-NEXT: movdqa %xmm2, %xmm4
+; SSE3-NEXT: pslld $16, %xmm4
+; SSE3-NEXT: psrad $16, %xmm4
+; SSE3-NEXT: packssdw %xmm5, %xmm4
+; SSE3-NEXT: movdqa %xmm1, %xmm5
+; SSE3-NEXT: pslld $16, %xmm5
+; SSE3-NEXT: psrad $16, %xmm5
+; SSE3-NEXT: movdqa %xmm0, %xmm6
+; SSE3-NEXT: pslld $16, %xmm6
+; SSE3-NEXT: psrad $16, %xmm6
+; SSE3-NEXT: packssdw %xmm5, %xmm6
; SSE3-NEXT: psrad $16, %xmm3
; SSE3-NEXT: psrad $16, %xmm2
; SSE3-NEXT: packssdw %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index 4f7a4676390f8..a49f7e9909760 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2475,16 +2475,14 @@ define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_8f64_to_8i16:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index 3ce186debddca..28241da6506a9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -85,13 +85,13 @@ define void @load_i16_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pslld $16, %xmm3
+; SSE-NEXT: psrad $16, %xmm3
+; SSE-NEXT: packssdw %xmm2, %xmm3
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
@@ -137,20 +137,20 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: movdqa 48(%rdi), %xmm3
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pslld $16, %xmm4
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: pslld $16, %xmm5
+; SSE-NEXT: psrad $16, %xmm5
+; SSE-NEXT: packssdw %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pslld $16, %xmm4
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pslld $16, %xmm6
+; SSE-NEXT: psrad $16, %xmm6
+; SSE-NEXT: packssdw %xmm4, %xmm6
; SSE-NEXT: psrad $16, %xmm3
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: packssdw %xmm3, %xmm1
@@ -270,34 +270,34 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-NEXT: movdqa 16(%rdi), %xmm7
; SSE-NEXT: movdqa 32(%rdi), %xmm3
; SSE-NEXT: movdqa 48(%rdi), %xmm9
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm8[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm11[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm11[0]
+; SSE-NEXT: movdqa %xmm9, %xmm8
+; SSE-NEXT: pslld $16, %xmm8
+; SSE-NEXT: psrad $16, %xmm8
+; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: pslld $16, %xmm5
+; SSE-NEXT: psrad $16, %xmm5
+; SSE-NEXT: packssdw %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm7, %xmm10
+; SSE-NEXT: pslld $16, %xmm10
+; SSE-NEXT: psrad $16, %xmm10
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: pslld $16, %xmm8
+; SSE-NEXT: psrad $16, %xmm8
+; SSE-NEXT: packssdw %xmm10, %xmm8
+; SSE-NEXT: movdqa %xmm6, %xmm11
+; SSE-NEXT: pslld $16, %xmm11
+; SSE-NEXT: psrad $16, %xmm11
+; SSE-NEXT: movdqa %xmm1, %xmm10
+; SSE-NEXT: pslld $16, %xmm10
+; SSE-NEXT: psrad $16, %xmm10
+; SSE-NEXT: packssdw %xmm11, %xmm10
+; SSE-NEXT: movdqa %xmm4, %xmm11
+; SSE-NEXT: pslld $16, %xmm11
+; SSE-NEXT: psrad $16, %xmm11
+; SSE-NEXT: movdqa %xmm0, %xmm12
+; SSE-NEXT: pslld $16, %xmm12
+; SSE-NEXT: psrad $16, %xmm12
+; SSE-NEXT: packssdw %xmm11, %xmm12
; SSE-NEXT: psrad $16, %xmm9
; SSE-NEXT: psrad $16, %xmm3
; SSE-NEXT: packssdw %xmm9, %xmm3
@@ -500,138 +500,139 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i16_stride2_vf64:
; SSE: # %bb.0:
-; SSE-NEXT: subq $24, %rsp
-; SSE-NEXT: movdqa 160(%rdi), %xmm11
-; SSE-NEXT: movdqa 176(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 64(%rdi), %xmm13
-; SSE-NEXT: movdqa 80(%rdi), %xmm4
+; SSE-NEXT: subq $40, %rsp
+; SSE-NEXT: movdqa 96(%rdi), %xmm13
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 112(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 96(%rdi), %xmm9
-; SSE-NEXT: movdqa 112(%rdi), %xmm3
+; SSE-NEXT: movdqa 128(%rdi), %xmm11
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 144(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdi), %xmm10
-; SSE-NEXT: movdqa 16(%rdi), %xmm15
-; SSE-NEXT: movdqa 32(%rdi), %xmm8
-; SSE-NEXT: movdqa 48(%rdi), %xmm14
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movdqa 160(%rdi), %xmm10
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm5
+; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa (%rdi), %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 144(%rdi), %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: movdqa 128(%rdi), %xmm12
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movdqa 48(%rdi), %xmm14
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: packssdw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 240(%rdi), %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: movdqa 224(%rdi), %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0]
-; SSE-NEXT: movdqa 208(%rdi), %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
-; SSE-NEXT: movdqa 192(%rdi), %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
-; SSE-NEXT: psrad $16, %xmm15
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: pslld $16, %xmm10
; SSE-NEXT: psrad $16, %xmm10
-; SSE-NEXT: packssdw %xmm15, %xmm10
+; SSE-NEXT: packssdw %xmm0, %xmm10
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: pslld $16, %xmm9
+; SSE-NEXT: psrad $16, %xmm9
+; SSE-NEXT: packssdw %xmm0, %xmm9
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: pslld $16, %xmm11
+; SSE-NEXT: psrad $16, %xmm11
+; SSE-NEXT: packssdw %xmm0, %xmm11
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: pslld $16, %xmm13
+; SSE-NEXT: psrad $16, %xmm13
+; SSE-NEXT: packssdw %xmm0, %xmm13
+; SSE-NEXT: movdqa 240(%rdi), %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: movdqa 224(%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm15
+; SSE-NEXT: pslld $16, %xmm15
+; SSE-NEXT: psrad $16, %xmm15
+; SSE-NEXT: packssdw %xmm0, %xmm15
+; SSE-NEXT: movdqa 80(%rdi), %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: movdqa 64(%rdi), %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm4
+; SSE-NEXT: pslld $16, %xmm4
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm1, %xmm4
+; SSE-NEXT: movdqa 208(%rdi), %xmm8
+; SSE-NEXT: movdqa %xmm8, %xmm6
+; SSE-NEXT: pslld $16, %xmm6
+; SSE-NEXT: psrad $16, %xmm6
+; SSE-NEXT: movdqa 192(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: packssdw %xmm6, %xmm1
; SSE-NEXT: psrad $16, %xmm14
-; SSE-NEXT: psrad $16, %xmm8
-; SSE-NEXT: packssdw %xmm14, %xmm8
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm14, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm6
+; SSE-NEXT: packssdw %xmm0, %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm14
+; SSE-NEXT: packssdw %xmm0, %xmm14
; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: psrad $16, %xmm13
-; SSE-NEXT: packssdw %xmm3, %xmm13
+; SSE-NEXT: psrad $16, %xmm5
+; SSE-NEXT: packssdw %xmm3, %xmm5
+; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm6
+; SSE-NEXT: packssdw %xmm0, %xmm6
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: psrad $16, %xmm9
-; SSE-NEXT: packssdw %xmm3, %xmm9
-; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm0, %xmm3
; SSE-NEXT: psrad $16, %xmm12
-; SSE-NEXT: packssdw %xmm4, %xmm12
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: psrad $16, %xmm11
-; SSE-NEXT: packssdw %xmm3, %xmm11
+; SSE-NEXT: psrad $16, %xmm7
+; SSE-NEXT: packssdw %xmm12, %xmm7
+; SSE-NEXT: psrad $16, %xmm8
; SSE-NEXT: psrad $16, %xmm2
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm2, %xmm0
-; SSE-NEXT: psrad $16, %xmm5
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: packssdw %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm6, 96(%rsi)
-; SSE-NEXT: movdqa %xmm7, 112(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, 64(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, 80(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, 32(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, 48(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, (%rsi)
-; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, 16(%rsi)
-; SSE-NEXT: movdqa %xmm1, 112(%rdx)
-; SSE-NEXT: movdqa %xmm0, 96(%rdx)
-; SSE-NEXT: movdqa %xmm11, 80(%rdx)
-; SSE-NEXT: movdqa %xmm12, 64(%rdx)
-; SSE-NEXT: movdqa %xmm9, 48(%rdx)
-; SSE-NEXT: movdqa %xmm13, 32(%rdx)
-; SSE-NEXT: movdqa %xmm8, 16(%rdx)
-; SSE-NEXT: movdqa %xmm10, (%rdx)
-; SSE-NEXT: addq $24, %rsp
+; SSE-NEXT: packssdw %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm1, 96(%rsi)
+; SSE-NEXT: movdqa %xmm4, 32(%rsi)
+; SSE-NEXT: movdqa %xmm15, 112(%rsi)
+; SSE-NEXT: movdqa %xmm13, 48(%rsi)
+; SSE-NEXT: movdqa %xmm11, 64(%rsi)
+; SSE-NEXT: movdqa %xmm9, (%rsi)
+; SSE-NEXT: movdqa %xmm10, 80(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm2, 96(%rdx)
+; SSE-NEXT: movdqa %xmm7, 112(%rdx)
+; SSE-NEXT: movdqa %xmm3, 64(%rdx)
+; SSE-NEXT: movdqa %xmm6, 80(%rdx)
+; SSE-NEXT: movdqa %xmm5, 32(%rdx)
+; SSE-NEXT: movdqa %xmm14, 48(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, (%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 16(%rdx)
+; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i16_stride2_vf64:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index ea4549aa67b98..cd68a3093bb16 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2526,13 +2526,11 @@ define <8 x i16> @shuffle_v8i16_04040404(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_02468ACE(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_02468ACE:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_02468ACE:
@@ -2898,13 +2896,11 @@ define <8 x i16> @shuffle_v8i16_8012345u(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_02468ace(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_02468ace:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_02468ace:
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 6f2e05b3e8387..1688ce6611c8f 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1588,13 +1588,11 @@ entry:
define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: trunc2x4i32_8i16:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x4i32_8i16: