[llvm] 8eeeddb - [X86] lowerV8I16GeneralSingleInputShuffle - for splat PSHUFW+PSHUFD patterns, widen the splats to encourage combines (#129854)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 6 06:26:02 PST 2025
Author: Simon Pilgrim
Date: 2025-03-06T14:25:57Z
New Revision: 8eeeddb5ed525bc41ddf96c96c509c32963e3a84
URL: https://github.com/llvm/llvm-project/commit/8eeeddb5ed525bc41ddf96c96c509c32963e3a84
DIFF: https://github.com/llvm/llvm-project/commit/8eeeddb5ed525bc41ddf96c96c509c32963e3a84.diff
LOG: [X86] lowerV8I16GeneralSingleInputShuffle - for splat PSHUFW+PSHUFD patterns, widen the splats to encourage combines (#129854)
For vXi16 patterns that lower to splats, ensure that PSHUFW mask splats to the entire LW/HW i64 half and then create a wide PSHUFD mask that splats the whole i64 element - this encourages further combines without depending on any unused elements from undef shuffle mask elements.
Fixes #129276
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/avx-splat.ll
llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
llvm/test/CodeGen/X86/cast-vsel.ll
llvm/test/CodeGen/X86/combine-and.ll
llvm/test/CodeGen/X86/combine-pavg.ll
llvm/test/CodeGen/X86/half.ll
llvm/test/CodeGen/X86/icmp-pow2-mask.ll
llvm/test/CodeGen/X86/insertelement-var-index.ll
llvm/test/CodeGen/X86/memset-inline.ll
llvm/test/CodeGen/X86/memset-nonzero.ll
llvm/test/CodeGen/X86/pr46527.ll
llvm/test/CodeGen/X86/pr62014.ll
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
llvm/test/CodeGen/X86/vec_set-H.ll
llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
llvm/test/CodeGen/X86/vector-rotate-128.ll
llvm/test/CodeGen/X86/vector-rotate-256.ll
llvm/test/CodeGen/X86/vector-sext.ll
llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-shl-128.ll
llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
llvm/test/CodeGen/X86/vector-trunc.ll
llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
llvm/test/CodeGen/X86/vector-zext.ll
llvm/test/CodeGen/X86/widened-broadcast.ll
llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fb799580f835a..1697da09e0f72 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13776,6 +13776,14 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
DWordPairs.resize(2, std::make_pair(-1, -1));
int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
DWordPairs[1].first, DWordPairs[1].second};
+ // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
+ if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
+ ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
+ int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
+ std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
+ PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
+ PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
+ }
if ((NumHToL + NumHToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
if ((NumLToL + NumLToH) == 0)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 792cc3dd49034..1102044038afe 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -2363,7 +2363,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa 16(%rdx), %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm2
@@ -2384,7 +2384,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa 16(%rdx), %xmm2
; SSE42-NEXT: paddb %xmm1, %xmm2
@@ -2405,7 +2405,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
@@ -2497,7 +2497,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
@@ -2762,7 +2762,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
@@ -2783,7 +2783,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa 16(%rdx), %xmm2
; SSE42-NEXT: paddb %xmm1, %xmm2
@@ -2802,7 +2802,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
@@ -3549,7 +3549,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
@@ -3568,7 +3568,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: movdqa 16(%rdx), %xmm2
@@ -3586,7 +3586,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
@@ -4951,7 +4951,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa 16(%rdx), %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: movdqa (%rdx), %xmm2
@@ -4970,7 +4970,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index ca589e63b671a..305509ca7fc3f 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1837,7 +1837,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm2
@@ -1856,7 +1856,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: movdqa 16(%rsi), %xmm2
; SSE42-NEXT: paddb %xmm1, %xmm2
@@ -1875,9 +1875,9 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
@@ -1954,7 +1954,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm1
@@ -2174,7 +2174,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
@@ -2192,7 +2192,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE42-NEXT: movdqa 16(%rsi), %xmm2
; SSE42-NEXT: paddb %xmm1, %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm0
@@ -2208,9 +2208,9 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
@@ -2835,7 +2835,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
@@ -2850,7 +2850,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; SSE42: # %bb.0:
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; SSE42-NEXT: paddb (%rsi), %xmm1
@@ -2865,7 +2865,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
@@ -3913,7 +3913,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; SSE-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa 16(%rsi), %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm2
@@ -3930,7 +3930,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index b7dea5f4151e2..7d0a5679936da 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -1899,7 +1899,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
@@ -1907,7 +1907,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <16 x i16> undef, i16 %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll
index 5d3a6e3e41217..15c2aab1a82e1 100644
--- a/llvm/test/CodeGen/X86/avx-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-splat.ll
@@ -17,7 +17,7 @@ define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcB:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index efb7ba393721b..423f2c49e70e5 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -97,7 +97,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
@@ -107,7 +107,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -286,7 +286,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
@@ -300,7 +300,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
@@ -519,7 +519,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm2
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
@@ -528,7 +528,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2
@@ -540,7 +540,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -550,7 +550,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 7863de8754fa0..f478fa5a1f6cd 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -122,7 +122,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
@@ -133,7 +133,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -360,7 +360,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
@@ -376,7 +376,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
@@ -658,7 +658,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm2
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
@@ -669,7 +669,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1
; SSE2-SSSE3-NEXT: psrlw $15, %xmm1
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2
@@ -683,7 +683,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -695,7 +695,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index 33a0946c8fbe9..2a79dae43bb2f 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -94,7 +94,7 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
@@ -105,7 +105,7 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -137,7 +137,7 @@ define <8 x i1> @bitcast_i8_8i1_freeze(i8 zeroext %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
@@ -148,7 +148,7 @@ define <8 x i1> @bitcast_i8_8i1_freeze(i8 zeroext %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll
index d5ccc152abc23..79513b205933e 100644
--- a/llvm/test/CodeGen/X86/cast-vsel.ll
+++ b/llvm/test/CodeGen/X86/cast-vsel.ll
@@ -397,10 +397,10 @@ define dso_local void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: movd %esi, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB6_1: # %vector.body
@@ -429,10 +429,10 @@ define dso_local void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE41-NEXT: movd %esi, %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000
; SSE41-NEXT: .p2align 4
; SSE41-NEXT: .LBB6_1: # %vector.body
@@ -458,10 +458,10 @@ define dso_local void @example24(i16 signext %x, i16 signext %y) nounwind {
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index cbdc867ddeb00..e5594dc9c5e3c 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -849,7 +849,7 @@ define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) {
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -858,7 +858,7 @@ define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll
index cb2d426a52b4b..4a75eb182d79d 100644
--- a/llvm/test/CodeGen/X86/combine-pavg.ll
+++ b/llvm/test/CodeGen/X86/combine-pavg.ll
@@ -86,14 +86,14 @@ define <8 x i16> @combine_pavgw_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm1, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_pavgw_demandedelts:
; AVX1: # %bb.0:
; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pavgw_demandedelts:
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index fb836cd2480a7..1b98886ba24e7 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1253,13 +1253,13 @@ define <8 x half> @shuffle(ptr %p) {
; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: movdqu (%rdi), %xmm0
; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: shuffle:
; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,4,4,4,4]
-; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: shuffle:
@@ -1267,7 +1267,7 @@ define <8 x half> @shuffle(ptr %p) {
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-I686-NEXT: movdqu (%eax), %xmm0
; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-I686-NEXT: retl
%1 = load <8 x half>, ptr %p, align 8
%2 = shufflevector <8 x half> %1, <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
index b0fe43a647716..af3b07bc131a9 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
@@ -56,7 +56,7 @@ define <16 x i16> @pow2_mask_v16i16(i16 zeroext %0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -70,7 +70,7 @@ define <16 x i16> @pow2_mask_v16i16(i16 zeroext %0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll
index 16946caf9a328..d151c6f28e51b 100644
--- a/llvm/test/CodeGen/X86/insertelement-var-index.ll
+++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll
@@ -33,7 +33,7 @@ define <16 x i8> @arg_i8_v16i8_undef(i8 %x, i32 %y) nounwind {
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: arg_i8_v16i8_undef:
@@ -80,14 +80,14 @@ define <8 x i16> @arg_i16_v8i16_undef(i16 %x, i32 %y) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i16_v8i16_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_i16_v8i16_undef:
@@ -239,7 +239,7 @@ define <16 x i8> @load_i8_v16i8_undef(ptr %p, i32 %y) nounwind {
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: load_i8_v16i8_undef:
@@ -284,7 +284,7 @@ define <8 x i16> @load_i16_v8i16_undef(ptr %p, i32 %y) nounwind {
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i16_v8i16_undef:
@@ -292,7 +292,7 @@ define <8 x i16> @load_i16_v8i16_undef(ptr %p, i32 %y) nounwind {
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i16_v8i16_undef:
@@ -465,7 +465,7 @@ define <16 x i16> @arg_i16_v16i16_undef(i16 %x, i32 %y) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -690,7 +690,7 @@ define <16 x i16> @load_i16_v16i16_undef(ptr %p, i32 %y) nounwind {
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/memset-inline.ll b/llvm/test/CodeGen/X86/memset-inline.ll
index 905a0ffda061f..d3999c01a5d71 100644
--- a/llvm/test/CodeGen/X86/memset-inline.ll
+++ b/llvm/test/CodeGen/X86/memset-inline.ll
@@ -188,7 +188,7 @@ define void @aligned_memset_16(ptr align 16 %a, i8 %value) nounwind {
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
@@ -224,7 +224,7 @@ define void @aligned_memset_32(ptr align 32 %a, i8 %value) nounwind {
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
@@ -264,7 +264,7 @@ define void @aligned_memset_64(ptr align 64 %a, i8 %value) nounwind {
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, 48(%rdi)
; SSE2-NEXT: movdqa %xmm0, 32(%rdi)
; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll
index 96ac8dff79530..d07b0f64d68c1 100644
--- a/llvm/test/CodeGen/X86/memset-nonzero.ll
+++ b/llvm/test/CodeGen/X86/memset-nonzero.ll
@@ -291,7 +291,7 @@ define void @memset_16_nonconst_bytes(ptr %x, i8 %c) {
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
@@ -337,7 +337,7 @@ define void @memset_32_nonconst_bytes(ptr %x, i8 %c) {
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
@@ -391,7 +391,7 @@ define void @memset_64_nonconst_bytes(ptr %x, i8 %c) {
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
@@ -466,7 +466,7 @@ define void @memset_128_nonconst_bytes(ptr %x, i8 %c) {
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
@@ -533,7 +533,7 @@ define void @memset_256_nonconst_bytes(ptr %x, i8 %c) {
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi)
diff --git a/llvm/test/CodeGen/X86/pr46527.ll b/llvm/test/CodeGen/X86/pr46527.ll
index ab454cfe470f3..a624055dc1041 100644
--- a/llvm/test/CodeGen/X86/pr46527.ll
+++ b/llvm/test/CodeGen/X86/pr46527.ll
@@ -19,7 +19,7 @@ define void @f(ptr %out, <16 x i8> %in, i1 %flag) {
; CHECK-NEXT: movd %edx, %xmm1
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; CHECK-NEXT: paddb %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%eax), %xmm1
diff --git a/llvm/test/CodeGen/X86/pr62014.ll b/llvm/test/CodeGen/X86/pr62014.ll
index a1ce577c0a024..19a6962731b6a 100644
--- a/llvm/test/CodeGen/X86/pr62014.ll
+++ b/llvm/test/CodeGen/X86/pr62014.ll
@@ -142,7 +142,7 @@ define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8
; SSE2: # %bb.0:
; SSE2-NEXT: movd %edi, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
@@ -157,7 +157,7 @@ define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movd %edi, %xmm0
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: pmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
; SSE42-NEXT: pand %xmm3, %xmm0
; SSE42-NEXT: pcmpeqw %xmm3, %xmm0
@@ -275,7 +275,7 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f
; SSE2: # %bb.0:
; SSE2-NEXT: movd %edi, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,2,4,8,16,32,64,128]
; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: pcmpeqw %xmm6, %xmm5
@@ -301,7 +301,7 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f
; SSE42-NEXT: movaps %xmm0, %xmm4
; SSE42-NEXT: movd %edi, %xmm0
; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,0,1]
; SSE42-NEXT: pmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
; SSE42-NEXT: pand %xmm5, %xmm6
; SSE42-NEXT: pcmpeqw %xmm5, %xmm6
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 9656822d144e4..61e3611dcedc9 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -258,7 +258,7 @@ define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: psubusw %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -266,7 +266,7 @@ define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -331,7 +331,7 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: psubusb %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -546,7 +546,7 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm2
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
@@ -556,7 +556,7 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -686,7 +686,7 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: movd %edi, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: psubusb %xmm2, %xmm0
; SSE2-NEXT: psubusb %xmm2, %xmm1
; SSE2-NEXT: retq
@@ -1220,7 +1220,7 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: movd %edi, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE2-NEXT: psubusb %xmm4, %xmm0
; SSE2-NEXT: psubusb %xmm4, %xmm1
; SSE2-NEXT: psubusb %xmm4, %xmm2
@@ -1291,7 +1291,7 @@ define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind {
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm4
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT: psubusw %xmm4, %xmm0
; SSE-NEXT: psubusw %xmm4, %xmm1
; SSE-NEXT: psubusw %xmm4, %xmm2
@@ -1303,7 +1303,7 @@ define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovd %edi, %xmm3
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubusw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index ca85afc352a57..dbdc45abb24d6 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -4343,8 +4343,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
; X86-SSE-NEXT: # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00]
; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00]
-; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
+; X86-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44]
+; X86-SSE-NEXT: # xmm0 = xmm0[0,1,0,1]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set1_epi8:
@@ -4369,8 +4369,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
; X64-SSE-NEXT: # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00]
; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00]
-; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
+; X64-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44]
+; X64-SSE-NEXT: # xmm0 = xmm0[0,1,0,1]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_set1_epi8:
@@ -4393,8 +4393,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
; X32-SSE-NEXT: # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00]
; X32-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00]
-; X32-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
+; X32-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44]
+; X32-SSE-NEXT: # xmm0 = xmm0[0,1,0,1]
; X32-SSE-NEXT: retq # encoding: [0xc3]
;
; X32-AVX1-LABEL: test_mm_set1_epi8:
@@ -4435,8 +4435,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
; X86-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00]
; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00]
-; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
+; X86-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44]
+; X86-SSE-NEXT: # xmm0 = xmm0[0,1,0,1]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set1_epi16:
@@ -4445,8 +4445,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X86-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0]
; X86-AVX1-NEXT: vpshuflw $0, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x70,0xc0,0x00]
; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-AVX1-NEXT: vpshufd $0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x00]
-; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
+; X86-AVX1-NEXT: vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44]
+; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,0,1]
; X86-AVX1-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set1_epi16:
@@ -4460,8 +4460,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X64-SSE-NEXT: movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
; X64-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00]
; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00]
-; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
+; X64-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44]
+; X64-SSE-NEXT: # xmm0 = xmm0[0,1,0,1]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_set1_epi16:
@@ -4469,8 +4469,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X64-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7]
; X64-AVX1-NEXT: vpshuflw $0, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x70,0xc0,0x00]
; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-AVX1-NEXT: vpshufd $0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x00]
-; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT: vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44]
+; X64-AVX1-NEXT: # xmm0 = xmm0[0,1,0,1]
; X64-AVX1-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_set1_epi16:
@@ -4483,8 +4483,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X32-SSE-NEXT: movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
; X32-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00]
; X32-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00]
-; X32-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
+; X32-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44]
+; X32-SSE-NEXT: # xmm0 = xmm0[0,1,0,1]
; X32-SSE-NEXT: retq # encoding: [0xc3]
;
; X32-AVX1-LABEL: test_mm_set1_epi16:
@@ -4492,8 +4492,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X32-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7]
; X32-AVX1-NEXT: vpshuflw $0, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x70,0xc0,0x00]
; X32-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X32-AVX1-NEXT: vpshufd $0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x00]
-; X32-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
+; X32-AVX1-NEXT: vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44]
+; X32-AVX1-NEXT: # xmm0 = xmm0[0,1,0,1]
; X32-AVX1-NEXT: retq # encoding: [0xc3]
;
; X32-AVX512-LABEL: test_mm_set1_epi16:
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index df8a85fd07258..5bd624c0697a0 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -196,7 +196,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE2-ONLY-NEXT: movd %xmm0, %eax
; SSE2-ONLY-NEXT: movw %ax, (%rsi)
; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSE2-ONLY-NEXT: retq
;
@@ -207,7 +207,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: movw %ax, (%rsi)
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE3-NEXT: movdqa %xmm0, (%rdx)
; SSE3-NEXT: retq
;
@@ -218,7 +218,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSSE3-ONLY-NEXT: movd %xmm0, %eax
; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSSE3-ONLY-NEXT: retq
;
@@ -228,7 +228,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE41-NEXT: pxor (%rdi), %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: movdqa %xmm0, (%rdx)
; SSE41-NEXT: retq
;
@@ -238,7 +238,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE42-NEXT: pxor (%rdi), %xmm0
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
@@ -248,7 +248,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX1-NEXT: retq
;
@@ -759,7 +759,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE2-ONLY-NEXT: movd %xmm0, %eax
; SSE2-ONLY-NEXT: movw %ax, (%rsi)
; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-ONLY-NEXT: retq
@@ -771,7 +771,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: movw %ax, (%rsi)
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE3-NEXT: movdqa %xmm0, (%rdx)
; SSE3-NEXT: movdqa %xmm0, 16(%rdx)
; SSE3-NEXT: retq
@@ -783,7 +783,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSSE3-ONLY-NEXT: movd %xmm0, %eax
; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSSE3-ONLY-NEXT: retq
@@ -794,7 +794,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE41-NEXT: pxor (%rdi), %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: movdqa %xmm0, (%rdx)
; SSE41-NEXT: movdqa %xmm0, 16(%rdx)
; SSE41-NEXT: retq
@@ -805,7 +805,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE42-NEXT: pxor (%rdi), %xmm0
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
@@ -816,7 +816,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vzeroupper
@@ -1877,7 +1877,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE2-ONLY-NEXT: movd %xmm0, %eax
; SSE2-ONLY-NEXT: movw %ax, (%rsi)
; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
@@ -1890,7 +1890,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: movw %ax, (%rsi)
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE3-NEXT: movdqa %xmm0, (%rdx)
; SSE3-NEXT: movdqa %xmm0, 16(%rdx)
; SSE3-NEXT: movdqa %xmm0, 32(%rdx)
@@ -1903,7 +1903,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSSE3-ONLY-NEXT: movd %xmm0, %eax
; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
@@ -1915,7 +1915,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE41-NEXT: pxor (%rdi), %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: movdqa %xmm0, (%rdx)
; SSE41-NEXT: movdqa %xmm0, 16(%rdx)
; SSE41-NEXT: movdqa %xmm0, 32(%rdx)
@@ -1927,7 +1927,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE42-NEXT: pxor (%rdi), %xmm0
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: movdqa %xmm0, 32(%rdx)
@@ -1939,7 +1939,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm1, (%rdx)
; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
@@ -5076,7 +5076,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE2-ONLY-NEXT: movd %xmm0, %eax
; SSE2-ONLY-NEXT: movw %ax, (%rsi)
; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
@@ -5090,7 +5090,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: movw %ax, (%rsi)
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE3-NEXT: movdqa %xmm0, (%rdx)
; SSE3-NEXT: movdqa %xmm0, 16(%rdx)
; SSE3-NEXT: movdqa %xmm0, 32(%rdx)
@@ -5104,7 +5104,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSSE3-ONLY-NEXT: movd %xmm0, %eax
; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
@@ -5117,7 +5117,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE41-NEXT: pxor (%rdi), %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: movdqa %xmm0, (%rdx)
; SSE41-NEXT: movdqa %xmm0, 16(%rdx)
; SSE41-NEXT: movdqa %xmm0, 32(%rdx)
@@ -5130,7 +5130,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE42-NEXT: pxor (%rdi), %xmm0
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: movdqa %xmm0, 32(%rdx)
@@ -5143,7 +5143,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
diff --git a/llvm/test/CodeGen/X86/vec_set-H.ll b/llvm/test/CodeGen/X86/vec_set-H.ll
index 071f9a162c387..932e899aaa44d 100644
--- a/llvm/test/CodeGen/X86/vec_set-H.ll
+++ b/llvm/test/CodeGen/X86/vec_set-H.ll
@@ -6,7 +6,7 @@ define <2 x i64> @doload64(i16 signext %x) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; CHECK-NEXT: retl
%tmp36 = insertelement <8 x i16> undef, i16 %x, i32 0
%tmp37 = insertelement <8 x i16> %tmp36, i16 %x, i32 1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 6c3e4a9009975..b763b7bac2432 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1028,7 +1028,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 47524b2062556..c7e430de1af97 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -844,7 +844,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 4e30d706c0e20..399bdb7a248b8 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1067,7 +1067,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index bc2ea6839accd..d81d6ef7c9f28 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -884,7 +884,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 5aeeefc545d06..2d5aa0977e4b6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -560,7 +560,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-NEXT: pand %xmm3, %xmm7
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pandn %xmm1, %xmm3
; SSE-NEXT: por %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,255,255,255,255]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index d0ecb22853001..184cfa5c6ae3c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -344,7 +344,7 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
; SSE-NEXT: pand %xmm7, %xmm8
; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
; SSE-NEXT: pandn %xmm9, %xmm7
; SSE-NEXT: por %xmm8, %xmm7
; SSE-NEXT: pandn %xmm7, %xmm4
@@ -363,7 +363,7 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,255,255,255,255]
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
@@ -701,7 +701,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
; SSE-NEXT: pand %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm4, %xmm7
@@ -737,7 +737,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-NEXT: packuswb %xmm0, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa %xmm11, %xmm14
; SSE-NEXT: pandn %xmm0, %xmm14
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,3]
@@ -780,7 +780,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-NEXT: pand %xmm12, %xmm9
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: pandn %xmm2, %xmm12
; SSE-NEXT: por %xmm9, %xmm12
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
@@ -792,7 +792,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-NEXT: packuswb %xmm13, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
; SSE-NEXT: pandn %xmm9, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,2,2]
@@ -819,12 +819,12 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm4
; SSE-NEXT: pandn %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
@@ -1378,7 +1378,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa (%r9), %xmm8
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: movdqa %xmm5, %xmm1
@@ -1395,7 +1395,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pandn %xmm2, %xmm1
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
; SSE-NEXT: pand %xmm10, %xmm3
; SSE-NEXT: movdqa %xmm4, %xmm9
@@ -1410,7 +1410,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pand %xmm0, %xmm10
; SSE-NEXT: por %xmm1, %xmm10
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pandn %xmm1, %xmm4
@@ -1492,11 +1492,11 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm5, %xmm15
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm13, %xmm1
; SSE-NEXT: por %xmm1, %xmm4
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: pandn %xmm1, %xmm7
; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -1539,7 +1539,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,0,0,0,4,5,6,7]
; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: pand %xmm5, %xmm0
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -1566,12 +1566,12 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm1, %xmm7
; SSE-NEXT: movdqa %xmm9, %xmm5
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: pand %xmm13, %xmm1
; SSE-NEXT: por %xmm1, %xmm7
; SSE-NEXT: movdqa %xmm6, %xmm8
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: movdqa %xmm2, %xmm9
; SSE-NEXT: pandn %xmm1, %xmm9
; SSE-NEXT: pand %xmm2, %xmm7
@@ -1590,7 +1590,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pand %xmm10, %xmm9
; SSE-NEXT: por %xmm7, %xmm9
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: pandn %xmm0, %xmm7
; SSE-NEXT: pshuflw $233, (%rsp), %xmm0 # 16-byte Folded Reload
@@ -1655,7 +1655,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm7, %xmm9
; SSE-NEXT: pand %xmm10, %xmm9
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1]
; SSE-NEXT: pandn %xmm7, %xmm10
; SSE-NEXT: por %xmm9, %xmm10
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
@@ -1680,7 +1680,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm2, %xmm6
; SSE-NEXT: por %xmm11, %xmm6
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
@@ -1689,7 +1689,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pand %xmm12, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: pandn %xmm0, %xmm12
; SSE-NEXT: por %xmm1, %xmm12
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
@@ -2660,7 +2660,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa 16(%r9), %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
; SSE-NEXT: pand %xmm10, %xmm0
; SSE-NEXT: movdqa %xmm4, %xmm8
@@ -2680,7 +2680,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
; SSE-NEXT: movdqa %xmm9, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
@@ -2727,7 +2727,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm10, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm10, %xmm0
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
@@ -2739,14 +2739,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm1, %xmm3
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: por %xmm1, %xmm3
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: por %xmm0, %xmm3
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7]
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pand %xmm9, %xmm3
@@ -2880,14 +2880,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm14, %xmm0
@@ -2966,7 +2966,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm1, %xmm3
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm2, %xmm14
; SSE-NEXT: pandn %xmm1, %xmm14
@@ -3001,11 +3001,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm13, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: pand %xmm13, %xmm1
; SSE-NEXT: por %xmm1, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: movdqa %xmm15, %xmm4
; SSE-NEXT: pandn %xmm1, %xmm4
; SSE-NEXT: pand %xmm15, %xmm3
@@ -3031,7 +3031,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm0, %xmm3
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm9, %xmm4
; SSE-NEXT: pandn %xmm0, %xmm4
; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -3102,11 +3102,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm13, %xmm5
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: por %xmm3, %xmm4
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; SSE-NEXT: movdqa %xmm15, %xmm10
; SSE-NEXT: pandn %xmm3, %xmm10
; SSE-NEXT: pand %xmm15, %xmm4
@@ -3121,7 +3121,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: pandn %xmm0, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: por %xmm0, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
@@ -3144,11 +3144,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm13, %xmm10
; SSE-NEXT: pandn %xmm4, %xmm10
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT: pand %xmm13, %xmm4
; SSE-NEXT: por %xmm4, %xmm10
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT: movdqa %xmm9, %xmm2
; SSE-NEXT: pandn %xmm4, %xmm2
; SSE-NEXT: pand %xmm9, %xmm10
@@ -3171,7 +3171,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE-NEXT: movdqa %xmm9, %xmm5
; SSE-NEXT: pandn %xmm3, %xmm5
; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
@@ -3210,7 +3210,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm3, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: por %xmm1, %xmm2
@@ -3232,11 +3232,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm13, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: pand %xmm13, %xmm1
; SSE-NEXT: por %xmm1, %xmm2
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: movdqa %xmm9, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
; SSE-NEXT: pand %xmm9, %xmm2
@@ -3246,7 +3246,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm0, %xmm7
; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: pshufhw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,7,7]
@@ -3255,7 +3255,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm0, %xmm13
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -3270,7 +3270,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -3320,7 +3320,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm4, %xmm11
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: pandn %xmm1, %xmm4
; SSE-NEXT: por %xmm11, %xmm4
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
@@ -5352,7 +5352,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm5, %xmm4
; SSE-NEXT: pandn %xmm3, %xmm5
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: por %xmm3, %xmm5
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
@@ -5360,7 +5360,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm5, %xmm3
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,6,6,6]
; SSE-NEXT: movdqa %xmm6, %xmm15
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm6
; SSE-NEXT: pandn %xmm5, %xmm6
; SSE-NEXT: movdqa %xmm10, %xmm5
@@ -5410,7 +5410,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm14, %xmm13
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm11
; SSE-NEXT: pandn %xmm6, %xmm11
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,2,2,3,4,5,6,7]
@@ -5450,7 +5450,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm11, %xmm6
; SSE-NEXT: pandn %xmm5, %xmm6
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; SSE-NEXT: pand %xmm11, %xmm5
; SSE-NEXT: por %xmm5, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
@@ -5461,13 +5461,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm4, %xmm6
; SSE-NEXT: pandn %xmm0, %xmm6
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: por %xmm0, %xmm6
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: por %xmm5, %xmm6
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm5
; SSE-NEXT: pandn %xmm0, %xmm5
; SSE-NEXT: pand %xmm7, %xmm6
@@ -5672,7 +5672,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm12, %xmm0
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
@@ -5680,7 +5680,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
@@ -5727,7 +5727,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm12, %xmm1
; SSE-NEXT: por %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm14, %xmm12
@@ -5735,7 +5735,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm2, %xmm3
; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -5781,13 +5781,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: pshufhw $170, (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: por %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm12, %xmm3
; SSE-NEXT: pandn %xmm2, %xmm3
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -5828,7 +5828,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm6, %xmm8
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: por %xmm1, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
@@ -5858,12 +5858,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm2, %xmm15
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: por %xmm2, %xmm15
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: movdqa %xmm7, %xmm10
; SSE-NEXT: pandn %xmm2, %xmm10
; SSE-NEXT: pand %xmm7, %xmm15
@@ -5904,11 +5904,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm10, %xmm15
; SSE-NEXT: movdqa %xmm13, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1]
; SSE-NEXT: pand %xmm0, %xmm10
; SSE-NEXT: por %xmm10, %xmm15
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
; SSE-NEXT: movdqa %xmm13, %xmm0
; SSE-NEXT: pandn %xmm10, %xmm0
@@ -5956,7 +5956,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: por %xmm2, %xmm5
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm1, %xmm8
; SSE-NEXT: pandn %xmm2, %xmm8
@@ -5977,7 +5977,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: pandn %xmm0, %xmm5
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm8
; SSE-NEXT: pandn %xmm0, %xmm8
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -6014,7 +6014,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm12, %xmm2
; SSE-NEXT: pandn %xmm0, %xmm2
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm12, %xmm0
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
@@ -6026,14 +6026,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm3, %xmm2
; SSE-NEXT: pandn %xmm6, %xmm2
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; SSE-NEXT: pand %xmm3, %xmm6
; SSE-NEXT: movdqa %xmm3, %xmm10
; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: pand %xmm5, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm6
; SSE-NEXT: pandn %xmm0, %xmm6
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7]
@@ -6062,7 +6062,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm0, %xmm2
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
@@ -6089,12 +6089,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm2, %xmm6
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: pand %xmm12, %xmm2
; SSE-NEXT: por %xmm2, %xmm6
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: movdqa %xmm7, %xmm8
; SSE-NEXT: pandn %xmm2, %xmm8
; SSE-NEXT: pand %xmm7, %xmm6
@@ -6130,11 +6130,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm10, %xmm6
; SSE-NEXT: pandn %xmm2, %xmm6
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: pand %xmm10, %xmm2
; SSE-NEXT: por %xmm2, %xmm6
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT: movdqa %xmm15, %xmm8
; SSE-NEXT: pandn %xmm2, %xmm8
; SSE-NEXT: pand %xmm15, %xmm6
@@ -6179,7 +6179,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: por %xmm2, %xmm6
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pandn %xmm8, %xmm4
@@ -6200,7 +6200,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm2, %xmm6
; SSE-NEXT: pandn %xmm0, %xmm6
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm8
; SSE-NEXT: pandn %xmm0, %xmm8
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
@@ -6239,7 +6239,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm6, %xmm3
; SSE-NEXT: pandn %xmm0, %xmm6
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm14
; SSE-NEXT: por %xmm0, %xmm6
@@ -6252,14 +6252,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: pandn %xmm6, %xmm0
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; SSE-NEXT: pand %xmm4, %xmm6
; SSE-NEXT: movdqa %xmm4, %xmm11
; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: por %xmm8, %xmm0
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm8
; SSE-NEXT: pandn %xmm6, %xmm8
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,5,6,6,7]
@@ -6288,7 +6288,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm0, %xmm8
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: por %xmm0, %xmm8
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
@@ -6315,12 +6315,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm8, %xmm10
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT: pand %xmm14, %xmm8
; SSE-NEXT: por %xmm8, %xmm10
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT: movdqa %xmm7, %xmm15
; SSE-NEXT: pandn %xmm8, %xmm15
; SSE-NEXT: pand %xmm7, %xmm10
@@ -6359,11 +6359,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm2, %xmm10
; SSE-NEXT: pandn %xmm8, %xmm10
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT: pand %xmm2, %xmm8
; SSE-NEXT: por %xmm8, %xmm10
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
; SSE-NEXT: movdqa %xmm2, %xmm15
; SSE-NEXT: pandn %xmm8, %xmm15
@@ -6409,7 +6409,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm14, %xmm4
; SSE-NEXT: por %xmm0, %xmm15
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm1, %xmm9
; SSE-NEXT: pandn %xmm0, %xmm9
@@ -6426,7 +6426,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: por %xmm10, %xmm0
; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm15
; SSE-NEXT: pandn %xmm10, %xmm15
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
@@ -6464,7 +6464,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm14, %xmm10
; SSE-NEXT: pandn %xmm0, %xmm10
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm14, %xmm0
; SSE-NEXT: por %xmm0, %xmm10
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7]
@@ -6473,7 +6473,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm1, %xmm15
; SSE-NEXT: pandn %xmm0, %xmm15
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm15
@@ -6483,7 +6483,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm15, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm7, %xmm10
; SSE-NEXT: pandn %xmm0, %xmm10
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7]
@@ -6510,7 +6510,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pandn %xmm0, %xmm10
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm6
; SSE-NEXT: por %xmm0, %xmm10
@@ -6532,7 +6532,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
; SSE-NEXT: pand %xmm15, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
@@ -6543,7 +6543,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm0, %xmm15
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa %xmm7, %xmm10
; SSE-NEXT: pandn %xmm0, %xmm10
; SSE-NEXT: pand %xmm7, %xmm15
@@ -6574,7 +6574,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: por %xmm10, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa %xmm6, %xmm4
; SSE-NEXT: pand %xmm6, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7]
@@ -6583,7 +6583,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm0, %xmm4
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm15
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
; SSE-NEXT: movdqa %xmm6, %xmm10
; SSE-NEXT: pandn %xmm0, %xmm10
@@ -6621,7 +6621,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm3, %xmm14
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: pandn %xmm0, %xmm3
; SSE-NEXT: por %xmm14, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index b114cba14cb6c..993e6afc0eaf3 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -898,7 +898,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; XOPAVX1-LABEL: splatvar_rotate_v8i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 86c4d79a28c89..ec45f64bdb0e6 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -735,7 +735,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 8620651c1c199..f57efb40bf0e3 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -2126,7 +2126,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) {
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
@@ -2137,7 +2137,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) {
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
@@ -2148,7 +2148,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) {
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
@@ -2159,7 +2159,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) {
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -2198,7 +2198,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) {
; X86-SSE2-NEXT: movzbl (%eax), %eax
; X86-SSE2-NEXT: movd %eax, %xmm0
; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pcmpeqw %xmm1, %xmm0
@@ -2210,7 +2210,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) {
; X86-SSE41-NEXT: movzbl (%eax), %eax
; X86-SSE41-NEXT: movd %eax, %xmm0
; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X86-SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; X86-SSE41-NEXT: pand %xmm1, %xmm0
; X86-SSE41-NEXT: pcmpeqw %xmm1, %xmm0
@@ -2654,7 +2654,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) {
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -2669,7 +2669,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) {
; SSSE3-NEXT: movzwl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
@@ -2684,7 +2684,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) {
; SSE41-NEXT: movzwl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2699,7 +2699,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) {
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
@@ -2737,7 +2737,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) {
; X86-SSE2-NEXT: movzwl (%eax), %eax
; X86-SSE2-NEXT: movd %eax, %xmm0
; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2753,7 +2753,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) {
; X86-SSE41-NEXT: movzwl (%eax), %eax
; X86-SSE41-NEXT: movd %eax, %xmm0
; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; X86-SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; X86-SSE41-NEXT: movdqa %xmm1, %xmm0
; X86-SSE41-NEXT: pand %xmm2, %xmm0
@@ -3764,7 +3764,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE2-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
@@ -3804,7 +3804,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movd %edi, %xmm0
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSSE3-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
@@ -3844,7 +3844,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; SSE41-NEXT: psllq $58, %xmm1
@@ -3879,7 +3879,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsllw $10, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $10, %xmm0, %xmm1
@@ -3920,7 +3920,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; X86-SSE2: # %bb.0: # %entry
; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; X86-SSE2-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
@@ -3960,7 +3960,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; X86-SSE41: # %bb.0: # %entry
; X86-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; X86-SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; X86-SSE41-NEXT: psllq $58, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 54056461bff8c..60295f1c145a1 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -803,7 +803,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; SSE2-NEXT: psrlw %xmm1, %xmm2
@@ -917,7 +917,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; X86-SSE-NEXT: pand %xmm2, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-SSE-NEXT: psrlw %xmm1, %xmm2
@@ -1107,7 +1107,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; SSE2-NEXT: psrlw %xmm1, %xmm2
@@ -1226,7 +1226,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; X86-SSE-NEXT: pand %xmm2, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-SSE-NEXT: psrlw %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index e577388d88c2c..4f8cbc07243fd 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1311,7 +1311,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; SSE2-NEXT: psrlw %xmm1, %xmm2
@@ -1426,7 +1426,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; X86-SSE-NEXT: pand %xmm2, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-SSE-NEXT: psrlw %xmm1, %xmm2
@@ -1449,7 +1449,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; SSE2-NEXT: psrlw %xmm1, %xmm2
@@ -1564,7 +1564,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; X86-SSE-NEXT: pand %xmm2, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-SSE-NEXT: psrlw %xmm1, %xmm2
@@ -1587,7 +1587,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; SSE2-NEXT: psrlw %xmm1, %xmm2
@@ -1693,7 +1693,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; X86-SSE-NEXT: pand %xmm2, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-SSE-NEXT: psrlw %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 467c1574180da..1d1697aa38bae 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -658,7 +658,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -756,7 +756,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%splat = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -904,7 +904,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1007,7 +1007,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%mod = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index e0d1c256ebecf..79281116665f6 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1077,7 +1077,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1176,7 +1176,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%splat = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -1195,7 +1195,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1294,7 +1294,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%splat = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -1313,7 +1313,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1403,7 +1403,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; X86-SSE-NEXT: psrlw $8, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%splat = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 4dda9ff09cc62..2b1cf5b671e53 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -568,7 +568,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: psllw %xmm1, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -663,7 +663,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X86-SSE-NEXT: psllw %xmm1, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%splat = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -810,7 +810,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
; SSE2-NEXT: psllw %xmm1, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -910,7 +910,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
; X86-SSE-NEXT: psllw %xmm1, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%mod = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index f213db0349c31..d245bdca6ee29 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -929,7 +929,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-NEXT: psllw %xmm1, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1024,7 +1024,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; X86-SSE-NEXT: psllw %xmm1, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%splat = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -1042,7 +1042,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-NEXT: psllw %xmm1, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1137,7 +1137,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; X86-SSE-NEXT: psllw %xmm1, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%splat = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -1155,7 +1155,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-NEXT: psllw %xmm1, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1243,7 +1243,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; X86-SSE-NEXT: psllw %xmm1, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE-NEXT: pand %xmm1, %xmm0
; X86-SSE-NEXT: retl
%splat = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index b1c90aa8021b8..442bfde6a9e2e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -18,7 +18,7 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -235,13 +235,13 @@ define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_0101010101010101:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101:
@@ -252,7 +252,7 @@ define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; XOPAVX1-LABEL: shuffle_v16i8_0101010101010101:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i8_0101010101010101:
@@ -2274,7 +2274,7 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(ptr %ptr) {
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
@@ -2328,7 +2328,7 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(ptr %ptr) {
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
@@ -2386,7 +2386,7 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(ptr %ptr) {
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
@@ -2435,7 +2435,7 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(ptr %ptr) {
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
@@ -2485,7 +2485,7 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(ptr %ptr) {
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
@@ -2553,7 +2553,7 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(ptr %ptr) {
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 212cde9fcd6b2..f3659fd934e71 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -68,13 +68,13 @@ define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_00000000:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_00000000:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_00000000:
@@ -85,7 +85,7 @@ define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
; XOPAVX1-LABEL: shuffle_v8i16_00000000:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v8i16_00000000:
@@ -3191,14 +3191,14 @@ define <8 x i16> @insert_dup_mem_v8i16_i32(ptr %ptr) {
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i32:
@@ -3210,7 +3210,7 @@ define <8 x i16> @insert_dup_mem_v8i16_i32(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_mem_v8i16_i32:
@@ -3230,7 +3230,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(ptr %ptr) {
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16:
@@ -3238,7 +3238,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(ptr %ptr) {
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_sext_i16:
@@ -3251,7 +3251,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(ptr %ptr) {
; XOPAVX1-NEXT: movzwl (%rdi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_mem_v8i16_sext_i16:
@@ -3271,14 +3271,14 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(ptr %ptr) {
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i32:
@@ -3290,7 +3290,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i32:
@@ -3309,7 +3309,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(ptr %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32:
@@ -3328,7 +3328,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(ptr %ptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32:
@@ -3340,7 +3340,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss (%rdi), %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i32:
@@ -3360,7 +3360,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(ptr %ptr) {
; SSE-NEXT: movswl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
@@ -3368,7 +3368,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(ptr %ptr) {
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
@@ -3391,7 +3391,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(ptr %ptr) {
; XOPAVX1-NEXT: movswl (%rdi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
@@ -3415,7 +3415,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(ptr %ptr) {
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
@@ -3481,14 +3481,14 @@ define <8 x i16> @insert_dup_mem_v8i16_i64(ptr %ptr) {
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i64:
@@ -3500,7 +3500,7 @@ define <8 x i16> @insert_dup_mem_v8i16_i64(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_mem_v8i16_i64:
@@ -3519,14 +3519,14 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(ptr %ptr) {
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i64:
@@ -3538,7 +3538,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i64:
@@ -3557,14 +3557,14 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(ptr %ptr) {
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i64:
@@ -3576,7 +3576,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
@@ -3596,7 +3596,7 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(ptr %ptr) {
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt7_mem_v8i16_i64:
@@ -3615,7 +3615,7 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(ptr %ptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v8i16_i64:
@@ -3627,7 +3627,7 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
@@ -3647,7 +3647,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(ptr %ptr) {
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
@@ -3655,7 +3655,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(ptr %ptr) {
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
@@ -3668,7 +3668,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(ptr %ptr) {
; XOPAVX1-NEXT: movzwl (%rdi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 4ebf4b3c5668a..154533d5ec107 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -13,7 +13,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -25,7 +25,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -41,7 +41,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -76,7 +76,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -96,7 +96,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -131,7 +131,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -151,7 +151,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -186,7 +186,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -206,7 +206,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -226,7 +226,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
@@ -244,7 +244,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -264,7 +264,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
@@ -282,7 +282,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -302,7 +302,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
@@ -320,7 +320,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -340,7 +340,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
@@ -362,7 +362,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -383,7 +383,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1],xmm1[0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -404,7 +404,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -425,7 +425,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1],xmm1[2,3],xmm0[0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -445,7 +445,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -466,7 +466,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1],xmm1[4,5],xmm0[0,1,0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -486,7 +486,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -507,7 +507,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1],xmm1[6,7],xmm0[0,1,0,1,0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -527,7 +527,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -548,7 +548,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1],xmm1[8,9],xmm0[0,1,0,1,0,1,0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -568,7 +568,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -589,7 +589,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1],xmm1[10,11],xmm0[0,1,0,1,0,1,0,1,0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -609,7 +609,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -630,7 +630,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1],xmm1[12,13],xmm0[0,1,0,1,0,1,0,1,0,1,0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -650,7 +650,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -671,7 +671,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[14,15],xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -691,13 +691,13 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
@@ -708,7 +708,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
@@ -722,13 +722,13 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; XOPAVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
@@ -741,13 +741,13 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
@@ -758,7 +758,7 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
@@ -772,13 +772,13 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
ret <16 x i16> %shuffle
@@ -2437,7 +2437,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_0
; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2451,7 +2451,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_0
; XOPAVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2531,7 +2531,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_1
; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
@@ -2546,7 +2546,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_1
; XOPAVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
@@ -3452,7 +3452,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -3473,7 +3473,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1],xmm1[0,1]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -6779,14 +6779,14 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
@@ -6798,7 +6798,7 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,
; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
@@ -6810,14 +6810,14 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,
; XOPAVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; XOPAVX2-NEXT: vpbroadcastd %xmm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastq %xmm0, %ymm0
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <16 x i16> %shuffle
@@ -6828,7 +6828,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -6860,7 +6860,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -6886,13 +6886,13 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
@@ -6903,7 +6903,7 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
@@ -6914,13 +6914,13 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a,
; XOPAVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
ret <16 x i16> %shuffle
@@ -6931,14 +6931,14 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
@@ -6951,7 +6951,7 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
@@ -6964,14 +6964,14 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
ret <16 x i16> %shuffle
@@ -7381,7 +7381,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -7413,7 +7413,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -7431,7 +7431,7 @@ define <16 x i16> @shuffle_v16i16_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11_1
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -7463,7 +7463,7 @@ define <16 x i16> @shuffle_v16i16_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11_1
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -7482,29 +7482,15 @@ define <16 x i16> @shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_1
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,30,31,30,31,30,31,u,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15:
; AVX512VL: # %bb.0:
@@ -7516,15 +7502,14 @@ define <16 x i16> @shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_1
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
; XOPAVX2-NEXT: retq
%r = shufflevector <16 x i16> %x, <16 x i16> poison, <16 x i32> splat(i32 15)
ret <16 x i16> %r
@@ -7761,7 +7746,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i32(ptr %ptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -7774,7 +7759,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i32(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -7795,7 +7780,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(ptr %ptr) {
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -7809,7 +7794,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(ptr %ptr) {
; XOPAVX1-NEXT: movzwl (%rdi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -7830,7 +7815,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(ptr %ptr) #0 {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -7843,7 +7828,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(ptr %ptr) #0 {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -7863,7 +7848,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(ptr %ptr) #0 {
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -7876,7 +7861,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(ptr %ptr) #0 {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss (%rdi), %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -7896,7 +7881,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i64(ptr %ptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -7909,7 +7894,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i64(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -7929,7 +7914,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(ptr %ptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -7942,7 +7927,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -7962,7 +7947,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(ptr %ptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -7975,7 +7960,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(ptr %ptr) {
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -8027,7 +8012,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(ptr %ptr) {
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -8041,7 +8026,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(ptr %ptr) {
; XOPAVX1-NEXT: movzwl (%rdi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d287fb6d5b834..d08bfc6e2d7ea 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-FAST
@@ -2416,7 +2416,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -2451,14 +2451,14 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
@@ -2471,7 +2471,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX512VLBW-SLOW: # %bb.0:
; AVX512VLBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX512VLBW-SLOW-NEXT: retq
;
; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
@@ -2500,7 +2500,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48>
ret <32 x i8> %shuffle
@@ -4845,29 +4845,15 @@ define <32 x i8> @shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,30,31,30,31,30,31,u,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31:
; AVX512VL: # %bb.0:
@@ -4879,15 +4865,14 @@ define <32 x i8> @shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31, i32 30, i32 31>
ret <32 x i8> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 1a5f5fd5e6db5..732cc445ddcd8 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1838,7 +1838,7 @@ define <8 x i16> @PR32160(<8 x i32> %x) {
; SSE-LABEL: PR32160:
; SSE: # %bb.0:
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: PR32160:
@@ -1852,7 +1852,7 @@ define <8 x i16> @PR32160(<8 x i32> %x) {
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
index 44bd58f9d4121..97124f0a9d8d9 100644
--- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
@@ -571,7 +571,7 @@ define <8 x i16> @PR47448_uge(i16 signext %0) {
; SSE2-NEXT: andl $7, %edi
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; SSE2-NEXT: pcmpgtw %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
@@ -583,7 +583,7 @@ define <8 x i16> @PR47448_uge(i16 signext %0) {
; SSE41-NEXT: andl $7, %edi
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
; SSE41-NEXT: pmaxuw %xmm1, %xmm0
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
@@ -594,7 +594,7 @@ define <8 x i16> @PR47448_uge(i16 signext %0) {
; AVX1-NEXT: andl $7, %edi
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -621,7 +621,7 @@ define <8 x i16> @PR47448_ugt(i16 signext %0) {
; SSE-NEXT: andl $7, %edi
; SSE-NEXT: movd %edi, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
@@ -630,7 +630,7 @@ define <8 x i16> @PR47448_ugt(i16 signext %0) {
; AVX1-NEXT: andl $7, %edi
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 74926f46ffa43..bd1a48ba5d6ec 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -2448,7 +2448,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE2-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
@@ -2469,7 +2469,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movd %edi, %xmm0
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSSE3-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
@@ -2490,7 +2490,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [63,63]
@@ -2510,7 +2510,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
@@ -2696,9 +2696,8 @@ define <16 x i16> @splatshuf_zext_v16i16(<16 x i8> %x) {
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll
index 8e47ed67bdcff..d25512900440e 100644
--- a/llvm/test/CodeGen/X86/widened-broadcast.ll
+++ b/llvm/test/CodeGen/X86/widened-broadcast.ll
@@ -261,13 +261,13 @@ define <16 x i8> @load_splat_16i8_16i8_0101010101010101(ptr %ptr) nounwind uwtab
; SSE-LABEL: load_splat_16i8_16i8_0101010101010101:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_16i8_16i8_0101010101010101:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_16i8_16i8_0101010101010101:
@@ -341,14 +341,14 @@ define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(ptr %ptr
; SSE-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -405,14 +405,14 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(ptr %ptr
; SSE-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index b5487ad192337..409d889a9cbc3 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -2359,7 +2359,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
@@ -2383,7 +2383,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE42-NEXT: paddb 48(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm1
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE42-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm2
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
@@ -2403,7 +2403,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
@@ -2507,7 +2507,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0]
@@ -2801,7 +2801,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,255,0,0,0,0,0,255,0,0,0,0,0]
@@ -2822,7 +2822,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; SSE42-NEXT: paddb 48(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm1
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3
; SSE42-NEXT: movdqa %xmm1, %xmm0
@@ -2843,7 +2843,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero
@@ -3685,7 +3685,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65535,0,0,65535,0,0]
@@ -3706,7 +3706,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pxor %xmm3, %xmm3
@@ -3727,7 +3727,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 95fde98536818..ab216cafcc923 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1832,7 +1832,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn 48(%rdi), %xmm1
; SSE2-NEXT: por %xmm1, %xmm2
@@ -1853,7 +1853,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm1
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255]
; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
@@ -1871,7 +1871,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
@@ -1960,7 +1960,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0]
@@ -2213,7 +2213,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,255,0,0,0,0,0,255,0,0,0,0,0]
@@ -2231,7 +2231,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm1
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2
; SSE42-NEXT: movdqa %xmm1, %xmm0
@@ -2249,7 +2249,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero
@@ -2955,7 +2955,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65535,0,0,65535,0,0]
@@ -2972,7 +2972,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; SSE42: # %bb.0:
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
@@ -2989,7 +2989,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero
More information about the llvm-commits
mailing list