[llvm] 17eafe0 - [X86][SSE] lowerV2I64Shuffle - use undef elements in PSHUFD mask widening
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 26 08:04:51 PDT 2020
Author: Simon Pilgrim
Date: 2020-07-26T16:04:22+01:00
New Revision: 17eafe0841d6e523d410771c8d4de99d5881c59d
URL: https://github.com/llvm/llvm-project/commit/17eafe0841d6e523d410771c8d4de99d5881c59d
DIFF: https://github.com/llvm/llvm-project/commit/17eafe0841d6e523d410771c8d4de99d5881c59d.diff
LOG: [X86][SSE] lowerV2I64Shuffle - use undef elements in PSHUFD mask widening
If we lower a v2i64 shuffle to PSHUFD, we currently clamp undef elements to 0 (i.e. elements 0,1 of the v4i32), which can result in the shuffle referencing more elements of the source vector than expected, affecting later shuffle combines and KnownBits/SimplifyDemanded calls.
By ensuring we widen the undef mask element, we allow getV4X86ShuffleImm8 to use inline elements as the default, which are more likely to fold.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avg.ll
llvm/test/CodeGen/X86/avx-cvt.ll
llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/buildvec-extract.ll
llvm/test/CodeGen/X86/cast-vsel.ll
llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
llvm/test/CodeGen/X86/combine-movmsk-avx.ll
llvm/test/CodeGen/X86/combine-sdiv.ll
llvm/test/CodeGen/X86/combine-shl.ll
llvm/test/CodeGen/X86/combine-sra.ll
llvm/test/CodeGen/X86/combine-srl.ll
llvm/test/CodeGen/X86/combine-udiv.ll
llvm/test/CodeGen/X86/combine-urem.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
llvm/test/CodeGen/X86/extract-store.ll
llvm/test/CodeGen/X86/extractelement-index.ll
llvm/test/CodeGen/X86/extractelement-load.ll
llvm/test/CodeGen/X86/gather-addresses.ll
llvm/test/CodeGen/X86/haddsub-2.ll
llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
llvm/test/CodeGen/X86/horizontal-reduce-add.ll
llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
llvm/test/CodeGen/X86/i128-add.ll
llvm/test/CodeGen/X86/inline-asm-x-i128.ll
llvm/test/CodeGen/X86/known-bits-vector.ll
llvm/test/CodeGen/X86/known-signbits-vector.ll
llvm/test/CodeGen/X86/madd.ll
llvm/test/CodeGen/X86/masked_compressstore.ll
llvm/test/CodeGen/X86/masked_gather.ll
llvm/test/CodeGen/X86/masked_load.ll
llvm/test/CodeGen/X86/masked_store.ll
llvm/test/CodeGen/X86/masked_store_trunc.ll
llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
llvm/test/CodeGen/X86/min-legal-vector-width.ll
llvm/test/CodeGen/X86/nontemporal-2.ll
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/phaddsub-extract.ll
llvm/test/CodeGen/X86/pmul.ll
llvm/test/CodeGen/X86/pmulh.ll
llvm/test/CodeGen/X86/pr15267.ll
llvm/test/CodeGen/X86/pr39733.ll
llvm/test/CodeGen/X86/pr42452.ll
llvm/test/CodeGen/X86/pr42905.ll
llvm/test/CodeGen/X86/pr44976.ll
llvm/test/CodeGen/X86/pr45378.ll
llvm/test/CodeGen/X86/pr46189.ll
llvm/test/CodeGen/X86/pr46455.ll
llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/sad.ll
llvm/test/CodeGen/X86/sdiv_fix.ll
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/setcc-wide-types.ll
llvm/test/CodeGen/X86/shrink_vmul.ll
llvm/test/CodeGen/X86/slow-pmulld.ll
llvm/test/CodeGen/X86/smul_fix_sat.ll
llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
llvm/test/CodeGen/X86/split-vector-rem.ll
llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/sse41.ll
llvm/test/CodeGen/X86/trunc-subvector.ll
llvm/test/CodeGen/X86/udiv_fix.ll
llvm/test/CodeGen/X86/udiv_fix_sat.ll
llvm/test/CodeGen/X86/uint_to_fp-3.ll
llvm/test/CodeGen/X86/umul_fix_sat.ll
llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/var-permute-128.ll
llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
llvm/test/CodeGen/X86/vec_cast2.ll
llvm/test/CodeGen/X86/vec_int_to_fp.ll
llvm/test/CodeGen/X86/vec_saddo.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/vec_ssubo.ll
llvm/test/CodeGen/X86/vec_uaddo.ll
llvm/test/CodeGen/X86/vec_umulo.ll
llvm/test/CodeGen/X86/vec_usubo.ll
llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
llvm/test/CodeGen/X86/vector-fshl-128.ll
llvm/test/CodeGen/X86/vector-fshl-256.ll
llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
llvm/test/CodeGen/X86/vector-pcmp.ll
llvm/test/CodeGen/X86/vector-reduce-add.ll
llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
llvm/test/CodeGen/X86/vector-reduce-and.ll
llvm/test/CodeGen/X86/vector-reduce-mul.ll
llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
llvm/test/CodeGen/X86/vector-reduce-or.ll
llvm/test/CodeGen/X86/vector-reduce-smax.ll
llvm/test/CodeGen/X86/vector-reduce-smin.ll
llvm/test/CodeGen/X86/vector-reduce-umax.ll
llvm/test/CodeGen/X86/vector-reduce-umin.ll
llvm/test/CodeGen/X86/vector-reduce-xor.ll
llvm/test/CodeGen/X86/vector-rem.ll
llvm/test/CodeGen/X86/vector-rotate-128.ll
llvm/test/CodeGen/X86/vector-rotate-256.ll
llvm/test/CodeGen/X86/vector-sext.ll
llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-shl-128.ll
llvm/test/CodeGen/X86/vector-shift-shl-256.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
llvm/test/CodeGen/X86/vector-shuffle-combining.ll
llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
llvm/test/CodeGen/X86/vector-zext.ll
llvm/test/CodeGen/X86/vsel-cmp-load.ll
llvm/test/CodeGen/X86/vselect-avx.ll
llvm/test/CodeGen/X86/vselect-pcmp.ll
llvm/test/CodeGen/X86/vshift-4.ll
llvm/test/CodeGen/X86/widen_conv-4.ll
llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
llvm/test/CodeGen/X86/xor.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 24bc264df129..7c134a8c7cb9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13723,9 +13723,10 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
V1 = DAG.getBitcast(MVT::v4i32, V1);
- int WidenedMask[4] = {
- std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
- std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+ int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
+ Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
+ Mask[1] < 0 ? -1 : (Mask[1] * 2),
+ Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
return DAG.getBitcast(
MVT::v2i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index d2638a1681e8..1411318d8176 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -160,9 +160,9 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
@@ -172,10 +172,10 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind {
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vmovdqa (%rsi), %xmm6
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
@@ -454,24 +454,24 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
@@ -482,10 +482,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4
; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
@@ -493,10 +493,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
@@ -504,10 +504,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll
index 653a88edd26a..b3fa8ac9aeed 100644
--- a/llvm/test/CodeGen/X86/avx-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx-cvt.ll
@@ -33,7 +33,7 @@ define <8 x float> @sitofp02(<8 x i16> %a) {
; AVX-LABEL: sitofp02:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
index 1fd3e15c3e01..b504646336de 100644
--- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -5,7 +5,7 @@
define i32 @hadd_16(<16 x i32> %x225) {
; KNL-LABEL: hadd_16:
; KNL: # %bb.0:
-; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -14,7 +14,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
;
; SKX-LABEL: hadd_16:
; SKX: # %bb.0:
-; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -32,7 +32,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
define i32 @hsub_16(<16 x i32> %x225) {
; KNL-LABEL: hsub_16:
; KNL: # %bb.0:
-; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -41,7 +41,7 @@ define i32 @hsub_16(<16 x i32> %x225) {
;
; SKX-LABEL: hsub_16:
; SKX: # %bb.0:
-; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index f115f9a6ef38..650bbe23b86e 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -6550,7 +6550,7 @@ define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6563,7 +6563,7 @@ define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
@@ -6602,7 +6602,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -6636,7 +6636,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -6668,7 +6668,7 @@ define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6681,7 +6681,7 @@ define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
@@ -6706,7 +6706,7 @@ define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6719,7 +6719,7 @@ define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
@@ -6747,7 +6747,7 @@ define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6762,7 +6762,7 @@ define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
@@ -6807,7 +6807,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -6844,7 +6844,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -6882,7 +6882,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6898,7 +6898,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
@@ -6928,7 +6928,7 @@ define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6943,7 +6943,7 @@ define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 863ab4dee123..2195526f94c3 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -114,21 +114,21 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: extract2_i32_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll
index 2e044548404e..6e9e4fd00636 100644
--- a/llvm/test/CodeGen/X86/cast-vsel.ll
+++ b/llvm/test/CodeGen/X86/cast-vsel.ll
@@ -31,7 +31,7 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
; SSE41-NEXT: packssdw %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5
; SSE41-NEXT: pmovsxwd %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -39,9 +39,9 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5
; AVX1-NEXT: vblendvps %xmm1, %xmm4, %xmm5, %xmm1
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
@@ -87,7 +87,7 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
; SSE41-NEXT: packssdw %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: retq
;
@@ -459,7 +459,7 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE41-NEXT: packssdw %xmm3, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: pmovsxwd %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, dj+4096(%rax)
@@ -480,9 +480,9 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB6_1: # %vector.body
diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index b96f44ec3073..16a993316d7e 100644
--- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -696,7 +696,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
; SSE2-LABEL: _clearupper16xi8b:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %r10
; SSE2-NEXT: movq %r10, %r8
; SSE2-NEXT: shrq $56, %r8
@@ -878,7 +878,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; SSE2-LABEL: _clearupper32xi8b:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %r10
; SSE2-NEXT: movq %r10, %r8
; SSE2-NEXT: shrq $56, %r8
diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
index b18c7246104d..17d01e1d3362 100644
--- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
@@ -93,7 +93,7 @@ define i32 @movmskps_sext_v4i64(<4 x i32> %a0) {
; AVX1-LABEL: movmskps_sext_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovmskpd %ymm0, %eax
@@ -116,7 +116,7 @@ define i32 @movmskps_sext_v8i32(<8 x i16> %a0) {
; AVX1-LABEL: movmskps_sext_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 1a52ebfc6cd9..ce411b5e8f06 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2959,7 +2959,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
;
; SSE41-LABEL: pr38658:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
@@ -2984,7 +2984,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
;
; AVX1-LABEL: pr38658:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
@@ -3058,7 +3058,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
;
; XOP-LABEL: pr38658:
; XOP: # %bb.0:
-; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; XOP-NEXT: vpmovsxbw %xmm1, %xmm1
; XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index a6950873daf5..383d1866aa1d 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -311,7 +311,7 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -344,7 +344,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -377,7 +377,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index e2f3f2dc7523..28a73cdb6a41 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -152,7 +152,7 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrad %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: psrad %xmm4, %xmm5
@@ -272,7 +272,7 @@ define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrld %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: psrld %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 36fbdaf53702..2e886defafd4 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -400,7 +400,7 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrld %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: psrld %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index e6d7aac92616..c44342d00357 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -233,7 +233,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -249,7 +249,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
@@ -307,7 +307,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -324,7 +324,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
@@ -384,7 +384,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -401,7 +401,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
index b21ed8ec60ce..cd0b21d02969 100644
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -213,7 +213,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrld %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm3, %xmm6
; SSE-NEXT: psrld %xmm5, %xmm6
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 753aee85f319..59101503b5a9 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -565,9 +565,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
; X86-NEXT: cltd
; X86-NEXT: idivl %esi
; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT: movd %xmm3, %eax
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X86-NEXT: movd %xmm3, %esi
; X86-NEXT: cltd
; X86-NEXT: idivl %esi
@@ -608,9 +608,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
; X64-NEXT: cltd
; X64-NEXT: idivl %ecx
; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movd %xmm3, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movd %xmm3, %ecx
; X64-NEXT: cltd
; X64-NEXT: idivl %ecx
@@ -657,11 +657,11 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: movd %xmm1, (%esp)
; X86-NEXT: calll __divdi3
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -707,9 +707,9 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
; X64-NEXT: cqto
; X64-NEXT: idivq %rcx
; X64-NEXT: movq %rax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm3, %rax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm3, %rcx
; X64-NEXT: cqto
; X64-NEXT: idivq %rcx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 43fc23c836a9..d787f91ababb 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -565,9 +565,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT: movd %xmm3, %eax
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X86-NEXT: movd %xmm3, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
@@ -608,9 +608,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movd %xmm3, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movd %xmm3, %ecx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
@@ -657,11 +657,11 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: movd %xmm1, (%esp)
; X86-NEXT: calll __udivdi3
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -707,9 +707,9 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm3, %rax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm3, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
diff --git a/llvm/test/CodeGen/X86/extract-store.ll b/llvm/test/CodeGen/X86/extract-store.ll
index c29fac61af3f..d50c2ed92075 100644
--- a/llvm/test/CodeGen/X86/extract-store.ll
+++ b/llvm/test/CodeGen/X86/extract-store.ll
@@ -314,7 +314,7 @@ define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind {
;
; SSE2-X64-LABEL: extract_i64_1:
; SSE2-X64: # %bb.0:
-; SSE2-X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-X64-NEXT: movq %xmm0, (%rdi)
; SSE2-X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll
index cf06f8dcb13e..f2e01e93361e 100644
--- a/llvm/test/CodeGen/X86/extractelement-index.ll
+++ b/llvm/test/CodeGen/X86/extractelement-index.ll
@@ -351,7 +351,7 @@ define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind {
define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind {
; SSE2-LABEL: extractelement_v2i64_1:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
@@ -371,7 +371,7 @@ define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind {
define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind {
; SSE2-LABEL: extractelement_v4i64_1:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
@@ -392,7 +392,7 @@ define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind {
define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind {
; SSE2-LABEL: extractelement_v4i64_3:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 94628c70d989..b694859b757c 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -10,13 +10,13 @@ define i32 @t(<2 x i64>* %val) nounwind {
; X32-SSE2-LABEL: t:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: pshufd $78, (%eax), %xmm0 # xmm0 = mem[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-SSE2-NEXT: movd %xmm0, %eax
; X32-SSE2-NEXT: retl
;
; X64-SSSE3-LABEL: t:
; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: pshufd $78, (%rdi), %xmm0 # xmm0 = mem[2,3,0,1]
+; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-SSSE3-NEXT: movd %xmm0, %eax
; X64-SSSE3-NEXT: retq
;
@@ -60,13 +60,13 @@ define void @t3(<2 x double>* %a0) {
;
; X64-SSSE3-LABEL: t3:
; X64-SSSE3: # %bb.0: # %bb
-; X64-SSSE3-NEXT: movsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero
+; X64-SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-SSSE3-NEXT: movsd %xmm0, (%rax)
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: t3:
; X64-AVX: # %bb.0: # %bb
-; X64-AVX-NEXT: vmovsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-AVX-NEXT: vmovsd %xmm0, (%rax)
; X64-AVX-NEXT: retq
bb:
@@ -139,7 +139,7 @@ define float @t6(<8 x float> *%a0) {
; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-SSE2-NEXT: xorps %xmm1, %xmm1
; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss {{\.LCPI.*}}, %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT: andps %xmm1, %xmm2
; X32-SSE2-NEXT: andnps %xmm0, %xmm1
; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -151,10 +151,10 @@ define float @t6(<8 x float> *%a0) {
;
; X64-SSSE3-LABEL: t6:
; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movshdup (%rdi), %xmm1 # xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
-; X64-SSSE3-NEXT: movss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: andps %xmm0, %xmm2
; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
; X64-SSSE3-NEXT: orps %xmm2, %xmm0
@@ -162,10 +162,10 @@ define float @t6(<8 x float> *%a0) {
;
; X64-AVX-LABEL: t6:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
-; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT: retq
%vecload = load <8 x float>, <8 x float>* %a0, align 32
@@ -184,7 +184,7 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
; X32-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-SSE2-NEXT: xorps %xmm1, %xmm1
; X32-SSE2-NEXT: cmpltss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss (%eax), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT: andps %xmm1, %xmm2
; X32-SSE2-NEXT: andnps %xmm0, %xmm1
; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -193,10 +193,10 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
;
; X64-SSSE3-LABEL: PR43971:
; X64-SSSE3: # %bb.0: # %entry
-; X64-SSSE3-NEXT: movss 24(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: xorps %xmm1, %xmm1
; X64-SSSE3-NEXT: cmpltss %xmm0, %xmm1
-; X64-SSSE3-NEXT: movss (%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: andps %xmm1, %xmm2
; X64-SSSE3-NEXT: andnps %xmm0, %xmm1
; X64-SSSE3-NEXT: orps %xmm2, %xmm1
@@ -205,10 +205,10 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
;
; X64-AVX-LABEL: PR43971:
; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: vmovss 24(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovss (%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT: vmovss %xmm0, (%rsi)
; X64-AVX-NEXT: retq
@@ -231,7 +231,7 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-SSE2-NEXT: xorps %xmm1, %xmm1
; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss {{\.LCPI.*}}, %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT: andps %xmm1, %xmm2
; X32-SSE2-NEXT: andnps %xmm0, %xmm1
; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -242,10 +242,10 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
;
; X64-SSSE3-LABEL: PR43971_1:
; X64-SSSE3: # %bb.0: # %entry
-; X64-SSSE3-NEXT: movshdup (%rdi), %xmm1 # xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
-; X64-SSSE3-NEXT: movss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: andps %xmm0, %xmm2
; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
; X64-SSSE3-NEXT: orps %xmm2, %xmm0
@@ -253,10 +253,10 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
;
; X64-AVX-LABEL: PR43971_1:
; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
-; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll
index 1cd85e6e582c..00f84a6e4b15 100644
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@@ -17,7 +17,7 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
; LIN-SSE2-NEXT: movd %xmm0, %eax
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; LIN-SSE2-NEXT: movd %xmm1, %ecx
-; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; LIN-SSE2-NEXT: movd %xmm1, %edx
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; LIN-SSE2-NEXT: movd %xmm0, %esi
@@ -56,7 +56,7 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
; WIN-SSE2-NEXT: movd %xmm0, %r8d
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; WIN-SSE2-NEXT: movd %xmm1, %r9d
-; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; WIN-SSE2-NEXT: movd %xmm1, %r10d
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; WIN-SSE2-NEXT: movd %xmm0, %edx
@@ -141,7 +141,7 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
; LIN-SSE2-NEXT: movd %xmm0, %eax
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; LIN-SSE2-NEXT: movd %xmm1, %edx
-; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; LIN-SSE2-NEXT: movd %xmm1, %esi
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; LIN-SSE2-NEXT: movd %xmm0, %edi
@@ -184,7 +184,7 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
; WIN-SSE2-NEXT: movd %xmm0, %eax
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; WIN-SSE2-NEXT: movd %xmm1, %ecx
-; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; WIN-SSE2-NEXT: movd %xmm1, %r8d
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; WIN-SSE2-NEXT: movd %xmm0, %edx
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll
index c4d470a6cd69..e36c0479448e 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/CodeGen/X86/haddsub-2.ll
@@ -127,7 +127,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
@@ -136,7 +136,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
@@ -181,7 +181,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
@@ -192,7 +192,7 @@ define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %esi, %xmm0
@@ -243,7 +243,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: subl %ecx, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
@@ -252,7 +252,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
@@ -297,7 +297,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
@@ -306,7 +306,7 @@ define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
@@ -513,7 +513,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
; SSE3-NEXT: addl %ecx, %r8d
-; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
@@ -522,7 +522,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %edx, %esi
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
@@ -531,7 +531,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r10d
; SSE3-NEXT: addl %eax, %r10d
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %ecx
@@ -540,7 +540,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
@@ -819,7 +819,7 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
@@ -830,7 +830,7 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-NEXT: subl %esi, %edx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE-NEXT: movd %xmm0, %esi
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: movd %xmm0, %edi
; SSE-NEXT: subl %edi, %esi
; SSE-NEXT: movd %esi, %xmm0
@@ -1133,7 +1133,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
; SSE3-NEXT: addl %ecx, %r8d
-; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
@@ -1142,7 +1142,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %edx, %esi
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
@@ -1151,7 +1151,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r10d
; SSE3-NEXT: addl %eax, %r10d
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %ecx
@@ -1160,7 +1160,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 6663459f49d5..740e4b291f5f 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -554,7 +554,7 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: psrld %xmm2, %xmm5
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrld %xmm2, %xmm4
@@ -640,7 +640,7 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: psrld %xmm2, %xmm5
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrld %xmm2, %xmm4
@@ -677,7 +677,7 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: psrld %xmm2, %xmm5
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrld %xmm2, %xmm4
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
index 64d8de9aead7..dab7785c85cc 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
@@ -11,7 +11,7 @@
define i32 @PR37890_v4i32(<4 x i32> %a) {
; SSE2-LABEL: PR37890_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -20,7 +20,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
;
; SSSE3-SLOW-LABEL: PR37890_v4i32:
; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -36,7 +36,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
;
; AVX1-SLOW-LABEL: PR37890_v4i32:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -52,7 +52,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
;
; AVX2-LABEL: PR37890_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -70,7 +70,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
define i16 @PR37890_v8i16(<8 x i16> %a) {
; SSE2-LABEL: PR37890_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddw %xmm1, %xmm0
@@ -83,7 +83,7 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
;
; SSSE3-SLOW-LABEL: PR37890_v8i16:
; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
@@ -105,7 +105,7 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
;
; AVX1-SLOW-LABEL: PR37890_v8i16:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -126,7 +126,7 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
;
; AVX2-LABEL: PR37890_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -151,7 +151,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
; SSE2-LABEL: PR37890_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -161,7 +161,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
; SSSE3-SLOW-LABEL: PR37890_v8i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -180,7 +180,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -202,7 +202,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -225,7 +225,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
; SSE2-LABEL: PR37890_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddw %xmm1, %xmm0
@@ -239,7 +239,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
; SSSE3-SLOW-LABEL: PR37890_v16i16:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
@@ -264,7 +264,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -291,7 +291,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -322,7 +322,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -334,7 +334,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
@@ -346,7 +346,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
; SSSE3-FAST-NEXT: paddd %xmm3, %xmm1
; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
@@ -359,7 +359,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -385,7 +385,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
index dc8f60248c67..a17b1db2c178 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -16,7 +16,7 @@
define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_reduce_v2i64:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE42-LABEL: test_reduce_v2i64:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X86-SSE42-NEXT: movd %xmm2, %eax
@@ -49,7 +49,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X86-AVX-LABEL: test_reduce_v2i64:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
@@ -58,7 +58,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v2i64:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -80,7 +80,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X64-SSE42-LABEL: test_reduce_v2i64:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; X64-SSE42-NEXT: movq %xmm2, %rax
@@ -88,7 +88,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX1-LABEL: test_reduce_v2i64:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -96,7 +96,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX2-LABEL: test_reduce_v2i64:
; X64-AVX2: ## %bb.0:
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -104,7 +104,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX512-LABEL: test_reduce_v2i64:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: retq
@@ -118,7 +118,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
define i32 @test_reduce_v4i32(<4 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v4i32:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -135,7 +135,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v4i32:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
@@ -144,7 +144,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X86-AVX-LABEL: test_reduce_v4i32:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -153,7 +153,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v4i32:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -170,7 +170,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v4i32:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
@@ -179,7 +179,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-AVX-LABEL: test_reduce_v4i32:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -198,7 +198,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
define i16 @test_reduce_v8i16(<8 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -229,7 +229,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -273,7 +273,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
define i8 @test_reduce_v16i8(<16 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -328,7 +328,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -419,7 +419,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -444,7 +444,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -457,7 +457,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
@@ -470,7 +470,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
@@ -496,7 +496,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -519,7 +519,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -531,7 +531,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -543,7 +543,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -554,7 +554,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
@@ -577,7 +577,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
@@ -595,7 +595,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE42-LABEL: test_reduce_v8i32:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
@@ -606,7 +606,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -618,7 +618,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -633,7 +633,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
@@ -651,7 +651,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-SSE42-LABEL: test_reduce_v8i32:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
@@ -662,7 +662,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -674,7 +674,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -686,7 +686,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -710,7 +710,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -758,7 +758,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X64-SSE2-LABEL: test_reduce_v16i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -838,7 +838,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
@@ -916,7 +916,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
@@ -1072,7 +1072,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-SSE2-NEXT: pand %xmm1, %xmm5
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pxor %xmm4, %xmm2
; X86-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1104,7 +1104,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-SSE42-NEXT: movapd %xmm2, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -1122,7 +1122,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
@@ -1137,7 +1137,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
@@ -1193,7 +1193,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-SSE2-NEXT: pand %xmm1, %xmm5
; X64-SSE2-NEXT: pandn %xmm2, %xmm1
; X64-SSE2-NEXT: por %xmm5, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm2
; X64-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1223,7 +1223,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-SSE42-NEXT: movapd %xmm2, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -1240,7 +1240,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -1254,7 +1254,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -1267,7 +1267,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
@@ -1303,7 +1303,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE2-NEXT: pand %xmm0, %xmm1
; X86-SSE2-NEXT: pandn %xmm4, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -1323,7 +1323,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm1
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
@@ -1337,7 +1337,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1350,7 +1350,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1375,7 +1375,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-SSE2-NEXT: pand %xmm0, %xmm1
; X64-SSE2-NEXT: pandn %xmm4, %xmm0
; X64-SSE2-NEXT: por %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -1395,7 +1395,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm1
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
@@ -1409,7 +1409,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1422,7 +1422,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1436,7 +1436,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1465,7 +1465,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1
; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm1
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
@@ -1521,7 +1521,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1
; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm1
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
@@ -1622,7 +1622,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE2-NEXT: pand %xmm0, %xmm1
; X86-SSE2-NEXT: pandn %xmm4, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -1716,7 +1716,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE2-NEXT: pand %xmm0, %xmm1
; X64-SSE2-NEXT: pandn %xmm4, %xmm0
; X64-SSE2-NEXT: por %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -1837,7 +1837,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1869,7 +1869,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1914,7 +1914,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1946,7 +1946,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1991,7 +1991,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i8_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2047,7 +2047,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v32i8_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2119,7 +2119,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v64i8_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2175,7 +2175,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v64i8_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
index 987da0f68082..17c3a9fd4a01 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -16,7 +16,7 @@
define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_reduce_v2i64:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE42-LABEL: test_reduce_v2i64:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -50,7 +50,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X86-AVX-LABEL: test_reduce_v2i64:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
@@ -59,7 +59,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v2i64:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -81,7 +81,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X64-SSE42-LABEL: test_reduce_v2i64:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -90,7 +90,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX1-LABEL: test_reduce_v2i64:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -98,7 +98,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX2-LABEL: test_reduce_v2i64:
; X64-AVX2: ## %bb.0:
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -106,7 +106,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX512-LABEL: test_reduce_v2i64:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: retq
@@ -120,7 +120,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
define i32 @test_reduce_v4i32(<4 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v4i32:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -137,7 +137,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v4i32:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
@@ -146,7 +146,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X86-AVX-LABEL: test_reduce_v4i32:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -155,7 +155,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v4i32:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -172,7 +172,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v4i32:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
@@ -181,7 +181,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-AVX-LABEL: test_reduce_v4i32:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -200,7 +200,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
define i16 @test_reduce_v8i16(<8 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -231,7 +231,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -275,7 +275,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
define i8 @test_reduce_v16i8(<16 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -330,7 +330,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -421,7 +421,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -447,7 +447,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -460,7 +460,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
@@ -473,7 +473,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
@@ -499,7 +499,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -523,7 +523,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -535,7 +535,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -547,7 +547,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -558,7 +558,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
@@ -581,7 +581,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
@@ -599,7 +599,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE42-LABEL: test_reduce_v8i32:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
@@ -610,7 +610,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -622,7 +622,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -637,7 +637,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
@@ -655,7 +655,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-SSE42-LABEL: test_reduce_v8i32:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
@@ -666,7 +666,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -678,7 +678,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -690,7 +690,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -714,7 +714,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -762,7 +762,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X64-SSE2-LABEL: test_reduce_v16i16:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -842,7 +842,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm0, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm2
@@ -920,7 +920,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm2
; X64-SSE2-NEXT: por %xmm0, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; X64-SSE2-NEXT: pand %xmm1, %xmm2
@@ -1076,7 +1076,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-SSE2-NEXT: pand %xmm0, %xmm1
; X86-SSE2-NEXT: pandn %xmm5, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pxor %xmm4, %xmm2
; X86-SSE2-NEXT: pxor %xmm1, %xmm4
@@ -1108,7 +1108,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-SSE42-NEXT: movapd %xmm3, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -1126,7 +1126,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
@@ -1141,7 +1141,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
@@ -1197,7 +1197,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-SSE2-NEXT: pand %xmm1, %xmm3
; X64-SSE2-NEXT: pandn %xmm5, %xmm1
; X64-SSE2-NEXT: por %xmm3, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm2
; X64-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1227,7 +1227,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-SSE42-NEXT: movapd %xmm3, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -1244,7 +1244,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -1258,7 +1258,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -1271,7 +1271,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
@@ -1307,7 +1307,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE2-NEXT: pand %xmm1, %xmm4
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm4, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
@@ -1327,7 +1327,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE42-NEXT: pminsd %xmm3, %xmm1
; X86-SSE42-NEXT: pminsd %xmm2, %xmm1
; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
@@ -1341,7 +1341,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1354,7 +1354,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1379,7 +1379,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-SSE2-NEXT: pand %xmm1, %xmm4
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm4, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
@@ -1399,7 +1399,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-SSE42-NEXT: pminsd %xmm3, %xmm1
; X64-SSE42-NEXT: pminsd %xmm2, %xmm1
; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
@@ -1413,7 +1413,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1426,7 +1426,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1440,7 +1440,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1469,7 +1469,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X86-SSE2-NEXT: pminsw %xmm3, %xmm1
; X86-SSE2-NEXT: pminsw %xmm2, %xmm1
; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
@@ -1525,7 +1525,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X64-SSE2-NEXT: pminsw %xmm3, %xmm1
; X64-SSE2-NEXT: pminsw %xmm2, %xmm1
; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
@@ -1626,7 +1626,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE2-NEXT: pand %xmm1, %xmm4
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm4, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm1
@@ -1720,7 +1720,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE2-NEXT: pand %xmm1, %xmm4
; X64-SSE2-NEXT: pandn %xmm0, %xmm1
; X64-SSE2-NEXT: por %xmm4, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm1
@@ -1841,7 +1841,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1873,7 +1873,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1918,7 +1918,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1950,7 +1950,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1995,7 +1995,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i8_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2051,7 +2051,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v32i8_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2123,7 +2123,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v64i8_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2179,7 +2179,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v64i8_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index 6e59cd046cb0..c69551aa3d88 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -16,7 +16,7 @@
define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_reduce_v2i64:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE42-LABEL: test_reduce_v2i64:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
; X86-SSE42-NEXT: pxor %xmm3, %xmm0
; X86-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -52,7 +52,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X86-AVX1-LABEL: test_reduce_v2i64:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
@@ -65,7 +65,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X86-AVX2-LABEL: test_reduce_v2i64:
; X86-AVX2: ## %bb.0:
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -77,7 +77,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v2i64:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -99,7 +99,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X64-SSE42-LABEL: test_reduce_v2i64:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; X64-SSE42-NEXT: pxor %xmm3, %xmm0
; X64-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -110,7 +110,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX1-LABEL: test_reduce_v2i64:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -121,7 +121,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX2-LABEL: test_reduce_v2i64:
; X64-AVX2: ## %bb.0:
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -132,7 +132,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX512-LABEL: test_reduce_v2i64:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: retq
@@ -146,7 +146,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
define i32 @test_reduce_v4i32(<4 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v4i32:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -169,7 +169,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v4i32:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
@@ -178,7 +178,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X86-AVX-LABEL: test_reduce_v4i32:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -187,7 +187,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v4i32:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -210,7 +210,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v4i32:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
@@ -219,7 +219,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-AVX-LABEL: test_reduce_v4i32:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -238,7 +238,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
define i16 @test_reduce_v8i16(<8 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X86-SSE2-NEXT: pxor %xmm2, %xmm0
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -275,7 +275,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X64-SSE2-NEXT: pxor %xmm2, %xmm0
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -344,7 +344,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
define i8 @test_reduce_v16i8(<16 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -385,7 +385,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -485,7 +485,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -514,7 +514,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE42-NEXT: pxor %xmm3, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: pxor %xmm3, %xmm0
; X86-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -533,7 +533,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -551,7 +551,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -579,7 +579,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -606,7 +606,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-SSE42-NEXT: pxor %xmm3, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: pxor %xmm3, %xmm0
; X64-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -623,7 +623,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -640,7 +640,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -653,7 +653,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
@@ -680,7 +680,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -703,7 +703,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE42-LABEL: test_reduce_v8i32:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
@@ -714,7 +714,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -726,7 +726,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -745,7 +745,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -768,7 +768,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-SSE42-LABEL: test_reduce_v8i32:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
@@ -779,7 +779,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -791,7 +791,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -803,7 +803,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -830,7 +830,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm0
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -885,7 +885,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm0
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -965,7 +965,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i8:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -1026,7 +1026,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE2-LABEL: test_reduce_v32i8:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -1169,7 +1169,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-SSE2-NEXT: pand %xmm1, %xmm5
; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pxor %xmm4, %xmm2
; X86-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1210,7 +1210,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-SSE42-NEXT: xorpd %xmm5, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
; X86-SSE42-NEXT: pxor %xmm5, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm5
@@ -1238,7 +1238,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1260,7 +1260,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1318,7 +1318,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-SSE2-NEXT: pand %xmm1, %xmm5
; X64-SSE2-NEXT: pandn %xmm2, %xmm1
; X64-SSE2-NEXT: por %xmm5, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm2
; X64-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1357,7 +1357,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-SSE42-NEXT: xorpd %xmm5, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
; X64-SSE42-NEXT: pxor %xmm5, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm5
@@ -1383,7 +1383,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1404,7 +1404,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1419,7 +1419,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
@@ -1465,7 +1465,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE2-NEXT: pand %xmm1, %xmm5
; X86-SSE2-NEXT: pandn %xmm2, %xmm1
; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pxor %xmm4, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1490,7 +1490,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxud %xmm2, %xmm1
; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
@@ -1504,7 +1504,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1517,7 +1517,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1552,7 +1552,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-SSE2-NEXT: pand %xmm1, %xmm5
; X64-SSE2-NEXT: pandn %xmm2, %xmm1
; X64-SSE2-NEXT: por %xmm5, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm2
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1577,7 +1577,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxud %xmm2, %xmm1
; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
@@ -1591,7 +1591,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1604,7 +1604,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1618,7 +1618,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1652,7 +1652,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm2
; X86-SSE2-NEXT: pxor %xmm4, %xmm0
; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1717,7 +1717,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm0
; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1810,7 +1810,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1
; X86-SSE2-NEXT: pmaxub %xmm2, %xmm1
; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
@@ -1879,7 +1879,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1
; X64-SSE2-NEXT: pmaxub %xmm2, %xmm1
; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
@@ -1987,7 +1987,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X86-SSE2-NEXT: pxor %xmm2, %xmm0
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2025,7 +2025,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X64-SSE2-NEXT: pxor %xmm2, %xmm0
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2097,7 +2097,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X86-SSE2-NEXT: pxor %xmm2, %xmm0
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2135,7 +2135,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X64-SSE2-NEXT: pxor %xmm2, %xmm0
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2207,7 +2207,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i8_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -2249,7 +2249,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v32i8_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -2332,7 +2332,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v64i8_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -2374,7 +2374,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v64i8_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index 413b5f2ac4aa..5f33520200d2 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -16,7 +16,7 @@
define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_reduce_v2i64:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X86-SSE42-LABEL: test_reduce_v2i64:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
; X86-SSE42-NEXT: movdqa %xmm1, %xmm3
; X86-SSE42-NEXT: pxor %xmm0, %xmm3
@@ -53,7 +53,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X86-AVX1-LABEL: test_reduce_v2i64:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
@@ -66,7 +66,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X86-AVX2-LABEL: test_reduce_v2i64:
; X86-AVX2: ## %bb.0:
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -78,7 +78,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v2i64:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -100,7 +100,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X64-SSE42-LABEL: test_reduce_v2i64:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; X64-SSE42-NEXT: movdqa %xmm1, %xmm3
; X64-SSE42-NEXT: pxor %xmm0, %xmm3
@@ -112,7 +112,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX1-LABEL: test_reduce_v2i64:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -123,7 +123,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX2-LABEL: test_reduce_v2i64:
; X64-AVX2: ## %bb.0:
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -134,7 +134,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
;
; X64-AVX512-LABEL: test_reduce_v2i64:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: retq
@@ -148,7 +148,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
define i32 @test_reduce_v4i32(<4 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v4i32:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -171,7 +171,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v4i32:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pminud %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminud %xmm1, %xmm0
@@ -180,7 +180,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X86-AVX-LABEL: test_reduce_v4i32:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -189,7 +189,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v4i32:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -212,7 +212,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v4i32:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pminud %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminud %xmm1, %xmm0
@@ -221,7 +221,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
;
; X64-AVX-LABEL: test_reduce_v4i32:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -240,7 +240,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
define i16 @test_reduce_v8i16(<8 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X86-SSE2-NEXT: pxor %xmm2, %xmm0
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -271,7 +271,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X64-SSE2-NEXT: pxor %xmm2, %xmm0
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -315,7 +315,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
define i8 @test_reduce_v16i8(<16 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminub %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -350,7 +350,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminub %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -421,7 +421,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -451,7 +451,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-SSE42-NEXT: pxor %xmm3, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: pxor %xmm3, %xmm0
; X86-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -471,7 +471,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
@@ -489,7 +489,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -517,7 +517,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -545,7 +545,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-SSE42-NEXT: pxor %xmm3, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: pxor %xmm3, %xmm0
; X64-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -563,7 +563,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3
; X64-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
@@ -580,7 +580,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -593,7 +593,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
@@ -620,7 +620,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE2-NEXT: pand %xmm4, %xmm0
; X86-SSE2-NEXT: pandn %xmm1, %xmm4
; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -643,7 +643,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE42-LABEL: test_reduce_v8i32:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminud %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pminud %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminud %xmm1, %xmm0
@@ -654,7 +654,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -666,7 +666,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -685,7 +685,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-SSE2-NEXT: pand %xmm4, %xmm0
; X64-SSE2-NEXT: pandn %xmm1, %xmm4
; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -708,7 +708,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-SSE42-LABEL: test_reduce_v8i32:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminud %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pminud %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminud %xmm1, %xmm0
@@ -719,7 +719,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -731,7 +731,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -743,7 +743,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -770,7 +770,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
; X86-SSE2-NEXT: pxor %xmm2, %xmm0
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -816,7 +816,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
; X64-SSE2-NEXT: pxor %xmm2, %xmm0
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -885,7 +885,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i8:
; X86-SSE2: ## %bb.0:
; X86-SSE2-NEXT: pminub %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminub %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -937,7 +937,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE2-LABEL: test_reduce_v32i8:
; X64-SSE2: ## %bb.0:
; X64-SSE2-NEXT: pminub %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminub %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -1069,7 +1069,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-SSE2-NEXT: pand %xmm0, %xmm1
; X86-SSE2-NEXT: pandn %xmm5, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pxor %xmm4, %xmm2
; X86-SSE2-NEXT: pxor %xmm1, %xmm4
@@ -1111,7 +1111,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-SSE42-NEXT: xorpd %xmm4, %xmm0
; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
; X86-SSE42-NEXT: pxor %xmm4, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm4
@@ -1140,7 +1140,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1162,7 +1162,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -1220,7 +1220,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-SSE2-NEXT: pand %xmm1, %xmm3
; X64-SSE2-NEXT: pandn %xmm5, %xmm1
; X64-SSE2-NEXT: por %xmm3, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm2
; X64-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1260,7 +1260,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-SSE42-NEXT: xorpd %xmm4, %xmm0
; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
; X64-SSE42-NEXT: pxor %xmm4, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm4
@@ -1287,7 +1287,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1308,7 +1308,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -1323,7 +1323,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: vzeroupper
@@ -1369,7 +1369,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE2-NEXT: pand %xmm1, %xmm3
; X86-SSE2-NEXT: pandn %xmm6, %xmm1
; X86-SSE2-NEXT: por %xmm3, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: pxor %xmm4, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1394,7 +1394,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-SSE42-NEXT: pminud %xmm3, %xmm1
; X86-SSE42-NEXT: pminud %xmm2, %xmm1
; X86-SSE42-NEXT: pminud %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE42-NEXT: pminud %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-SSE42-NEXT: pminud %xmm0, %xmm1
@@ -1408,7 +1408,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1421,7 +1421,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1456,7 +1456,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-SSE2-NEXT: pand %xmm1, %xmm3
; X64-SSE2-NEXT: pandn %xmm6, %xmm1
; X64-SSE2-NEXT: por %xmm3, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm2
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1481,7 +1481,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-SSE42-NEXT: pminud %xmm3, %xmm1
; X64-SSE42-NEXT: pminud %xmm2, %xmm1
; X64-SSE42-NEXT: pminud %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE42-NEXT: pminud %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-SSE42-NEXT: pminud %xmm0, %xmm1
@@ -1495,7 +1495,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1508,7 +1508,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1522,7 +1522,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1556,7 +1556,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X86-SSE2-NEXT: pminsw %xmm1, %xmm2
; X86-SSE2-NEXT: pxor %xmm4, %xmm0
; X86-SSE2-NEXT: pminsw %xmm2, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1612,7 +1612,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
; X64-SSE2-NEXT: pminsw %xmm1, %xmm2
; X64-SSE2-NEXT: pxor %xmm4, %xmm0
; X64-SSE2-NEXT: pminsw %xmm2, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1694,7 +1694,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE2-NEXT: pminub %xmm3, %xmm1
; X86-SSE2-NEXT: pminub %xmm2, %xmm1
; X86-SSE2-NEXT: pminub %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: pminub %xmm1, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: pminub %xmm0, %xmm1
@@ -1754,7 +1754,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE2-NEXT: pminub %xmm3, %xmm1
; X64-SSE2-NEXT: pminub %xmm2, %xmm1
; X64-SSE2-NEXT: pminub %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: pminub %xmm1, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-SSE2-NEXT: pminub %xmm0, %xmm1
@@ -1851,7 +1851,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X86-SSE2-NEXT: pxor %xmm2, %xmm0
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -1883,7 +1883,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X64-SSE2-NEXT: pxor %xmm2, %xmm0
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -1928,7 +1928,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X86-SSE2-NEXT: pxor %xmm2, %xmm0
; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -1960,7 +1960,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; X64-SSE2-NEXT: pxor %xmm2, %xmm0
; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2005,7 +2005,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i8_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminub %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -2041,7 +2041,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v32i8_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminub %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -2093,7 +2093,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v64i8_v16i8:
; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: pminub %xmm0, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -2129,7 +2129,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
;
; X64-SSE2-LABEL: test_reduce_v64i8_v16i8:
; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: pminub %xmm0, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT: pminub %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll
index 8e75c4a575ee..d128d75e6457 100644
--- a/llvm/test/CodeGen/X86/i128-add.ll
+++ b/llvm/test/CodeGen/X86/i128-add.ll
@@ -77,7 +77,7 @@ define <1 x i128> @add_v1i128(<1 x i128> %x, <1 x i128> %y) nounwind {
; X64-NEXT: movq %rax, %xmm0
; X64-NEXT: movq %rsi, %xmm1
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: addq $1, %rax
; X64-NEXT: adcq $0, %rdx
diff --git a/llvm/test/CodeGen/X86/inline-asm-x-i128.ll b/llvm/test/CodeGen/X86/inline-asm-x-i128.ll
index 552ce96a53a5..7aee1d175494 100644
--- a/llvm/test/CodeGen/X86/inline-asm-x-i128.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-x-i128.ll
@@ -16,7 +16,7 @@ define { i64, i64 } @foo(i64 %0, i64 %1) {
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movq %xmm0, %rdx
; CHECK-NEXT: retq
%3 = zext i64 %1 to i128
diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll
index a1606e93e2e2..3b6912a9d946 100644
--- a/llvm/test/CodeGen/X86/known-bits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-bits-vector.ll
@@ -116,14 +116,14 @@ define <4 x i32> @knownbits_mask_shuffle_shuffle_undef_sext(<8 x i16> %a0) nounw
; X32-LABEL: knownbits_mask_shuffle_shuffle_undef_sext:
; X32: # %bb.0:
; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-NEXT: vpmovsxwd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_shuffle_shuffle_undef_sext:
; X64: # %bb.0:
; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NEXT: vpmovsxwd %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <8 x i16> %a0, <i16 -1, i16 -1, i16 -1, i16 -1, i16 15, i16 15, i16 15, i16 15>
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index b18b8079fd23..6c4d0a919ef0 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -252,7 +252,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
; X86-LABEL: signbits_sext_shuffle_sitofp:
; X86: # %bb.0:
; X86-NEXT: vpmovsxdq %xmm0, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X86-NEXT: vpmovsxdq %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
@@ -264,7 +264,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
; X64-AVX1-LABEL: signbits_sext_shuffle_sitofp:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
@@ -478,7 +478,7 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; X64-AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; X64-AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
; X64-AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm4
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 7bde3facc735..93097e2b98fb 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -25,7 +25,7 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB0_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -48,7 +48,7 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read
; AVX-NEXT: cmpq %rcx, %rax
; AVX-NEXT: jne .LBB0_1
; AVX-NEXT: # %bb.2: # %middle.block
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -103,7 +103,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -128,7 +128,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -153,7 +153,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -218,7 +218,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -251,7 +251,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -278,7 +278,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -305,7 +305,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -386,7 +386,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -433,7 +433,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -466,7 +466,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -498,7 +498,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -527,7 +527,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -593,7 +593,7 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB4_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -616,7 +616,7 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly
; AVX-NEXT: cmpq %rcx, %rax
; AVX-NEXT: jne .LBB4_1
; AVX-NEXT: # %bb.2: # %middle.block
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -675,7 +675,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
; SSE2-NEXT: jne .LBB5_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -701,7 +701,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -727,7 +727,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -798,7 +798,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -833,7 +833,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -861,7 +861,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -889,7 +889,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -982,7 +982,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -1033,7 +1033,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1068,7 +1068,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1102,7 +1102,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1132,7 +1132,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1195,7 +1195,7 @@ define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture read
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB8_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -1218,7 +1218,7 @@ define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture read
; AVX-NEXT: cmpq %rcx, %rax
; AVX-NEXT: jne .LBB8_1
; AVX-NEXT: # %bb.2: # %middle.block
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1279,7 +1279,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
; SSE2-NEXT: jne .LBB9_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -1310,7 +1310,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1336,7 +1336,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1414,7 +1414,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -1459,7 +1459,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1491,7 +1491,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1519,7 +1519,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1627,7 +1627,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
; SSE2-NEXT: paddd %xmm5, %xmm9
; SSE2-NEXT: paddd %xmm10, %xmm9
; SSE2-NEXT: paddd %xmm8, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
; SSE2-NEXT: paddd %xmm9, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -1700,7 +1700,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1744,7 +1744,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1778,7 +1778,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2074,7 +2074,7 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
;
; AVX1-LABEL: pmaddwd_negative2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
@@ -2647,7 +2647,7 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
; SSE2-NEXT: movdqu (%rcx), %xmm2
; SSE2-NEXT: pmaddwd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -2661,7 +2661,7 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2708,7 +2708,7 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -2730,7 +2730,7 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2798,13 +2798,13 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
; SSE2-NEXT: jne .LBB33_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: movd %xmm2, %ecx
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -2839,14 +2839,14 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovd %xmm1, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2874,14 +2874,14 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX256-NEXT: vmovd %xmm1, %ecx
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2953,7 +2953,7 @@ define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
; SSE2-NEXT: jne .LBB34_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -2980,7 +2980,7 @@ define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -3007,7 +3007,7 @@ define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 24ae0c77af2f..c93543ddda7a 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -2478,7 +2478,7 @@ define void @compressstore_v2i64_v2i1(i64* %base, <2 x i64> %V, <2 x i1> %mask)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB7_4
; SSE2-NEXT: LBB7_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: retq
;
@@ -2574,7 +2574,7 @@ define void @compressstore_v4i64_v4i1(i64* %base, <4 x i64> %V, <4 x i1> %mask)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB8_4
; SSE2-NEXT: LBB8_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: testb $4, %al
@@ -2585,7 +2585,7 @@ define void @compressstore_v4i64_v4i1(i64* %base, <4 x i64> %V, <4 x i1> %mask)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB8_8
; SSE2-NEXT: LBB8_7: ## %cond.store7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: retq
;
@@ -2762,7 +2762,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB9_4
; SSE2-NEXT: LBB9_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: testb $4, %al
@@ -2773,7 +2773,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB9_8
; SSE2-NEXT: LBB9_7: ## %cond.store7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: testb $16, %al
@@ -2784,7 +2784,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask)
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB9_12
; SSE2-NEXT: LBB9_11: ## %cond.store13
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: testb $64, %al
@@ -2795,7 +2795,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB9_16
; SSE2-NEXT: LBB9_15: ## %cond.store19
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: retq
;
@@ -3068,7 +3068,7 @@ define void @compressstore_v4i32_v4i32(i32* %base, <4 x i32> %V, <4 x i32> %trig
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB10_6
; SSE2-NEXT: LBB10_5: ## %cond.store4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm1, (%rdi)
; SSE2-NEXT: addq $4, %rdi
; SSE2-NEXT: testb $8, %al
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index dbd95213a60d..c3020b5f467c 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -34,23 +34,23 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
; SSE-NEXT: retq
; SSE-NEXT: .LBB0_1: # %cond.load
; SSE-NEXT: movq %xmm0, %rcx
-; SSE-NEXT: movd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5,6,7]
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je .LBB0_4
; SSE-NEXT: .LBB0_3: # %cond.load1
; SSE-NEXT: pextrq $1, %xmm0, %rcx
-; SSE-NEXT: insertps $16, (%rcx), %xmm3 # xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; SSE-NEXT: testb $4, %al
; SSE-NEXT: je .LBB0_6
; SSE-NEXT: .LBB0_5: # %cond.load4
; SSE-NEXT: movq %xmm1, %rcx
-; SSE-NEXT: insertps $32, (%rcx), %xmm3 # xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je .LBB0_8
; SSE-NEXT: .LBB0_7: # %cond.load7
; SSE-NEXT: pextrq $1, %xmm1, %rax
-; SSE-NEXT: insertps $48, (%rax), %xmm3 # xmm3 = xmm3[0,1,2],mem[0]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: retq
;
@@ -63,14 +63,14 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
; AVX1-NEXT: je .LBB0_2
; AVX1-NEXT: # %bb.1: # %cond.load
; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: .LBB0_2: # %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB0_4
; AVX1-NEXT: # %bb.3: # %cond.load1
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT: .LBB0_4: # %else2
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -84,12 +84,12 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB0_5: # %cond.load4
; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB0_8
; AVX1-NEXT: .LBB0_7: # %cond.load7
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT: vmovaps %xmm2, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -103,14 +103,14 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
; AVX2-NEXT: je .LBB0_2
; AVX2-NEXT: # %bb.1: # %cond.load
; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: .LBB0_2: # %else
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB0_4
; AVX2-NEXT: # %bb.3: # %cond.load1
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX2-NEXT: .LBB0_4: # %else2
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -124,12 +124,12 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB0_5: # %cond.load4
; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB0_8
; AVX2-NEXT: .LBB0_7: # %cond.load7
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX2-NEXT: vmovaps %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -159,7 +159,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; SSE-NEXT: pmovsxdq %xmm0, %xmm4
; SSE-NEXT: psllq $2, %xmm4
; SSE-NEXT: paddq %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pmovsxdq %xmm0, %xmm0
; SSE-NEXT: pxor %xmm5, %xmm5
; SSE-NEXT: pcmpeqd %xmm1, %xmm5
@@ -168,7 +168,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; SSE-NEXT: je .LBB1_2
; SSE-NEXT: # %bb.1: # %cond.load
; SSE-NEXT: movq %xmm4, %rcx
-; SSE-NEXT: movd (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE-NEXT: .LBB1_2: # %else
; SSE-NEXT: psllq $2, %xmm0
@@ -176,7 +176,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; SSE-NEXT: je .LBB1_4
; SSE-NEXT: # %bb.3: # %cond.load1
; SSE-NEXT: pextrq $1, %xmm4, %rcx
-; SSE-NEXT: insertps $16, (%rcx), %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; SSE-NEXT: .LBB1_4: # %else2
; SSE-NEXT: paddq %xmm0, %xmm3
; SSE-NEXT: testb $4, %al
@@ -189,12 +189,12 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; SSE-NEXT: retq
; SSE-NEXT: .LBB1_5: # %cond.load4
; SSE-NEXT: movq %xmm3, %rcx
-; SSE-NEXT: insertps $32, (%rcx), %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je .LBB1_8
; SSE-NEXT: .LBB1_7: # %cond.load7
; SSE-NEXT: pextrq $1, %xmm3, %rax
-; SSE-NEXT: insertps $48, (%rax), %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -202,7 +202,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm4, %xmm4
; AVX1-NEXT: vpsllq $2, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm4
@@ -217,14 +217,14 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; AVX1-NEXT: je .LBB1_2
; AVX1-NEXT: # %bb.1: # %cond.load
; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: .LBB1_2: # %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB1_4
; AVX1-NEXT: # %bb.3: # %cond.load1
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT: .LBB1_4: # %else2
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -238,12 +238,12 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB1_5: # %cond.load4
; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB1_8
; AVX1-NEXT: .LBB1_7: # %cond.load7
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT: vmovaps %xmm2, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -262,14 +262,14 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; AVX2-NEXT: je .LBB1_2
; AVX2-NEXT: # %bb.1: # %cond.load
; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: .LBB1_2: # %else
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB1_4
; AVX2-NEXT: # %bb.3: # %cond.load1
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX2-NEXT: .LBB1_4: # %else2
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -283,12 +283,12 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB1_5: # %cond.load4
; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB1_8
; AVX2-NEXT: .LBB1_7: # %cond.load7
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX2-NEXT: vmovaps %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -328,7 +328,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
; SSE-NEXT: je .LBB2_2
; SSE-NEXT: # %bb.1: # %cond.load
; SSE-NEXT: movq %xmm0, %rcx
-; SSE-NEXT: movd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5,6,7]
; SSE-NEXT: .LBB2_2: # %else
; SSE-NEXT: psllq $2, %xmm1
@@ -336,7 +336,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
; SSE-NEXT: je .LBB2_4
; SSE-NEXT: # %bb.3: # %cond.load1
; SSE-NEXT: pextrq $1, %xmm0, %rcx
-; SSE-NEXT: insertps $16, (%rcx), %xmm3 # xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; SSE-NEXT: .LBB2_4: # %else2
; SSE-NEXT: paddq %xmm1, %xmm4
; SSE-NEXT: testb $4, %al
@@ -349,12 +349,12 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
; SSE-NEXT: retq
; SSE-NEXT: .LBB2_5: # %cond.load4
; SSE-NEXT: movq %xmm4, %rcx
-; SSE-NEXT: insertps $32, (%rcx), %xmm3 # xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je .LBB2_8
; SSE-NEXT: .LBB2_7: # %cond.load7
; SSE-NEXT: pextrq $1, %xmm4, %rax
-; SSE-NEXT: insertps $48, (%rax), %xmm3 # xmm3 = xmm3[0,1,2],mem[0]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: retq
;
@@ -375,14 +375,14 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
; AVX1-NEXT: je .LBB2_2
; AVX1-NEXT: # %bb.1: # %cond.load
; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: .LBB2_2: # %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB2_4
; AVX1-NEXT: # %bb.3: # %cond.load1
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT: .LBB2_4: # %else2
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -396,12 +396,12 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB2_5: # %cond.load4
; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB2_8
; AVX1-NEXT: .LBB2_7: # %cond.load7
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT: vmovaps %xmm2, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -419,14 +419,14 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
; AVX2-NEXT: je .LBB2_2
; AVX2-NEXT: # %bb.1: # %cond.load
; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: .LBB2_2: # %else
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB2_4
; AVX2-NEXT: # %bb.3: # %cond.load1
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX2-NEXT: .LBB2_4: # %else2
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -440,12 +440,12 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB2_5: # %cond.load4
; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB2_8
; AVX2-NEXT: .LBB2_7: # %cond.load7
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX2-NEXT: vmovaps %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -480,7 +480,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE-NEXT: pmovsxdq %xmm0, %xmm0
; SSE-NEXT: paddq %xmm8, %xmm0
; SSE-NEXT: pxor %xmm6, %xmm6
@@ -513,7 +513,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pextrq $1, %xmm4, %rcx
; SSE-NEXT: pinsrb $3, (%rcx), %xmm5
; SSE-NEXT: .LBB3_8: # %else8
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: paddq %xmm8, %xmm0
; SSE-NEXT: testb $16, %al
; SSE-NEXT: je .LBB3_10
@@ -542,7 +542,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pextrq $1, %xmm1, %rcx
; SSE-NEXT: pinsrb $7, (%rcx), %xmm5
; SSE-NEXT: .LBB3_16: # %else20
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE-NEXT: paddq %xmm8, %xmm0
; SSE-NEXT: testl $256, %eax # imm = 0x100
; SSE-NEXT: je .LBB3_18
@@ -571,7 +571,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pextrq $1, %xmm1, %rcx
; SSE-NEXT: pinsrb $11, (%rcx), %xmm5
; SSE-NEXT: .LBB3_24: # %else32
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE-NEXT: paddq %xmm8, %xmm0
; SSE-NEXT: testl $4096, %eax # imm = 0x1000
; SSE-NEXT: je .LBB3_26
@@ -611,7 +611,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: vmovq %rdi, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm6, %xmm6
; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -626,7 +626,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: vpinsrb $0, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_2: # %else
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB3_4
@@ -657,7 +657,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: vpinsrb $4, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_10: # %else11
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm6
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB3_12
@@ -689,7 +689,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: vpinsrb $8, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_18: # %else23
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
; AVX1-NEXT: testl $512, %eax # imm = 0x200
; AVX1-NEXT: je .LBB3_20
@@ -1040,7 +1040,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; SSE-NEXT: je .LBB4_1
; SSE-NEXT: # %bb.2: # %cond.load
; SSE-NEXT: movq %xmm5, %rcx
-; SSE-NEXT: movd (%rcx), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: testb $2, %al
; SSE-NEXT: jne .LBB4_4
; SSE-NEXT: jmp .LBB4_5
@@ -1105,7 +1105,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; SSE-NEXT: je .LBB4_19
; SSE-NEXT: # %bb.20: # %cond.load23
; SSE-NEXT: movq %xmm4, %rcx
-; SSE-NEXT: movd (%rcx), %xmm5 # xmm5 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
; SSE-NEXT: testb $2, %al
; SSE-NEXT: jne .LBB4_22
; SSE-NEXT: jmp .LBB4_23
@@ -1174,7 +1174,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; SSE-NEXT: je .LBB4_37
; SSE-NEXT: # %bb.38: # %cond.load72
; SSE-NEXT: movq %xmm4, %rcx
-; SSE-NEXT: movd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: testb $2, %al
; SSE-NEXT: jne .LBB4_40
; SSE-NEXT: jmp .LBB4_41
@@ -1260,7 +1260,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: je .LBB4_2
; AVX1-NEXT: # %bb.1: # %cond.load
; AVX1-NEXT: vmovq %xmm3, %rdx
-; AVX1-NEXT: vmovd (%rdx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: .LBB4_2: # %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_4
@@ -1334,7 +1334,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: je .LBB4_18
; AVX1-NEXT: # %bb.17: # %cond.load23
; AVX1-NEXT: vmovq %xmm7, %rcx
-; AVX1-NEXT: vmovd (%rcx), %xmm4 # xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: .LBB4_18: # %else27
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_20
@@ -1405,7 +1405,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: je .LBB4_34
; AVX1-NEXT: # %bb.33: # %cond.load72
; AVX1-NEXT: vmovq %xmm7, %rcx
-; AVX1-NEXT: vmovd (%rcx), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: .LBB4_34: # %else76
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_36
@@ -1491,7 +1491,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX2-NEXT: je .LBB4_2
; AVX2-NEXT: # %bb.1: # %cond.load
; AVX2-NEXT: vmovq %xmm3, %rcx
-; AVX2-NEXT: vmovd (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: .LBB4_2: # %else
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB4_4
@@ -1534,7 +1534,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX2-NEXT: je .LBB4_18
; AVX2-NEXT: # %bb.17: # %cond.load23
; AVX2-NEXT: vmovq %xmm3, %rcx
-; AVX2-NEXT: vmovd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX2-NEXT: .LBB4_18: # %else27
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB4_20
@@ -1678,7 +1678,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX2-NEXT: jmp .LBB4_32
; AVX2-NEXT: .LBB4_33: # %cond.load72
; AVX2-NEXT: vmovq %xmm3, %rcx
-; AVX2-NEXT: vmovd (%rcx), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB4_36
; AVX2-NEXT: .LBB4_35: # %cond.load78
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index a14d537bc25d..e8dc7412eef8 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -153,7 +153,7 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, <
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
@@ -233,7 +233,7 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %ad
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
@@ -458,12 +458,12 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <
;
; AVX1-LABEL: load_v8f64_v8i16:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
@@ -479,7 +479,7 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <
;
; AVX2-LABEL: load_v8f64_v8i16:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
@@ -1778,12 +1778,12 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6
;
; AVX1-LABEL: load_v8i64_v8i16:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
@@ -1799,7 +1799,7 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6
;
; AVX2-LABEL: load_v8i64_v8i16:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 29678e8495c2..389281726d27 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -838,7 +838,7 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB7_4
; SSE2-NEXT: LBB7_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
@@ -922,7 +922,7 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB8_4
; SSE2-NEXT: LBB8_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm0, 8(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB8_6
@@ -931,7 +931,7 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB8_8
; SSE2-NEXT: LBB8_7: ## %cond.store5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm0, 24(%rdi)
; SSE2-NEXT: retq
;
@@ -1158,7 +1158,7 @@ define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB11_6
; SSE2-NEXT: LBB11_5: ## %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB11_8
@@ -1280,7 +1280,7 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB12_6
; SSE2-NEXT: LBB12_5: ## %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB12_8
@@ -1299,7 +1299,7 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB12_14
; SSE2-NEXT: LBB12_13: ## %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 24(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB12_16
@@ -4674,7 +4674,7 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x doub
; AVX1: ## %bb.0:
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi)
@@ -4853,7 +4853,7 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je LBB25_6
; SSE2-NEXT: LBB25_5: ## %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 341a34991118..640e145c2023 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -60,7 +60,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB0_6
; SSE2-NEXT: .LBB0_5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne .LBB0_7
@@ -75,7 +75,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je .LBB0_14
; SSE2-NEXT: .LBB0_13: # %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 24(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB0_16
@@ -1030,7 +1030,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB3_6
; SSE2-NEXT: .LBB3_5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB3_8
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index c120684c4261..17c113f098eb 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -141,7 +141,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB0_6
; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm4, 8(%rdi)
; SSE2-NEXT: .LBB0_6: # %else4
; SSE2-NEXT: por %xmm3, %xmm2
@@ -176,7 +176,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je .LBB0_14
; SSE2-NEXT: .LBB0_13: # %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 24(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB0_16
@@ -1579,7 +1579,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB3_6
; SSE2-NEXT: .LBB3_5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB3_8
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 0160733732d7..2ca9ebb0d5c6 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -87,7 +87,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB0_6
; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: .LBB0_6: # %else4
; SSE2-NEXT: por %xmm0, %xmm3
@@ -122,7 +122,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je .LBB0_14
; SSE2-NEXT: .LBB0_13: # %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 24(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB0_16
@@ -1351,7 +1351,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB3_6
; SSE2-NEXT: .LBB3_5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB3_8
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
index 8b3756a1fa24..6dcc47b9a65c 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -309,7 +309,7 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -320,7 +320,7 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
@@ -348,12 +348,12 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, (%rsi)
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT: movq %xmm1, %rax
; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT: retq
@@ -422,7 +422,7 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -433,7 +433,7 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
@@ -461,12 +461,12 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, (%rsi)
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT: movq %xmm1, %rax
; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 2980a1665db1..666df4a1b960 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -197,7 +197,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -263,7 +263,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -327,7 +327,7 @@ define i32 @sad_16i8_256() "min-legal-vector-width"="256" {
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -388,7 +388,7 @@ define i32 @sad_16i8_512() "min-legal-vector-width"="512" {
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -955,10 +955,10 @@ define void @zext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal
; CHECK-LABEL: zext_v16i8_v16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
@@ -977,10 +977,10 @@ define void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal
; CHECK-LABEL: sext_v16i8_v16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpmovsxwq %xmm1, %ymm1
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; CHECK-NEXT: vpmovsxwq %xmm3, %ymm3
; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT: vpmovsxwq %xmm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll
index c83c675e7889..4e83a7ce7231 100644
--- a/llvm/test/CodeGen/X86/nontemporal-2.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-2.ll
@@ -595,14 +595,14 @@ define void @test_extract_f64(<2 x double> %arg, double* %dst) {
define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
; SSE2-LABEL: test_extract_i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movntiq %rax, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_i64:
; SSE4A: # %bb.0:
-; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE4A-NEXT: movq %xmm0, %rax
; SSE4A-NEXT: movntiq %rax, (%rdi)
; SSE4A-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 2f74c830221b..d24fd3f024d4 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -9,7 +9,7 @@
define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind {
; SSE2-LABEL: v3i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movq %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
@@ -285,7 +285,7 @@ define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: movw %ax, 12(%rdi)
; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
@@ -391,7 +391,7 @@ define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movq %xmm2, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
@@ -539,7 +539,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
@@ -637,7 +637,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
@@ -1202,7 +1202,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
; SSE2-NEXT: movups 32(%rdi), %xmm10
; SSE2-NEXT: movups 48(%rdi), %xmm12
; SSE2-NEXT: movdqa %xmm0, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[3,3]
@@ -1215,7 +1215,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[1,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm4[0,2]
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3]
@@ -1243,12 +1243,12 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
; SSE42-NEXT: movdqu 48(%rdi), %xmm5
; SSE42-NEXT: movdqa %xmm2, %xmm6
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[1]
; SSE42-NEXT: movdqa %xmm9, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,2,2]
@@ -1476,7 +1476,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,3,3]
@@ -1489,7 +1489,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5],xmm4[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3]
diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll
index dd258c5f424a..7f3eb0898cc8 100644
--- a/llvm/test/CodeGen/X86/phaddsub-extract.ll
+++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll
@@ -46,7 +46,7 @@ define i32 @extract_extract01_v4i32_add_i32(<4 x i32> %x) {
define i32 @extract_extract23_v4i32_add_i32(<4 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
@@ -114,7 +114,7 @@ define i32 @extract_extract01_v4i32_add_i32_commute(<4 x i32> %x) {
define i32 @extract_extract23_v4i32_add_i32_commute(<4 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
@@ -326,7 +326,7 @@ define i32 @extract_extract01_v4i32_sub_i32(<4 x i32> %x) {
define i32 @extract_extract23_v4i32_sub_i32(<4 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4i32_sub_i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %eax
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
@@ -382,7 +382,7 @@ define i32 @extract_extract01_v4i32_sub_i32_commute(<4 x i32> %x) {
define i32 @extract_extract23_v4i32_sub_i32_commute(<4 x i32> %x) {
; SSE3-LABEL: extract_extract23_v4i32_sub_i32_commute:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm1, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
@@ -557,7 +557,7 @@ define i32 @extract_extract01_v8i32_add_i32(<8 x i32> %x) {
define i32 @extract_extract23_v8i32_add_i32(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
@@ -594,7 +594,7 @@ define i32 @extract_extract23_v8i32_add_i32(<8 x i32> %x) {
define i32 @extract_extract67_v8i32_add_i32(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
@@ -684,7 +684,7 @@ define i32 @extract_extract01_v8i32_add_i32_commute(<8 x i32> %x) {
define i32 @extract_extract23_v8i32_add_i32_commute(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
@@ -721,7 +721,7 @@ define i32 @extract_extract23_v8i32_add_i32_commute(<8 x i32> %x) {
define i32 @extract_extract67_v8i32_add_i32_commute(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
@@ -1119,7 +1119,7 @@ define i32 @extract_extract01_v8i32_sub_i32(<8 x i32> %x) {
define i32 @extract_extract23_v8i32_sub_i32(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8i32_sub_i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %eax
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
@@ -1156,7 +1156,7 @@ define i32 @extract_extract23_v8i32_sub_i32(<8 x i32> %x) {
define i32 @extract_extract67_v8i32_sub_i32(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8i32_sub_i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
@@ -1672,7 +1672,7 @@ define i32 @extract_extract01_v4i32_add_i32_uses3(<4 x i32> %x, i32* %p1, i32* %
define i32 @partial_reduction_add_v8i32(<8 x i32> %x) {
; SSE3-SLOW-LABEL: partial_reduction_add_v8i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -1681,7 +1681,7 @@ define i32 @partial_reduction_add_v8i32(<8 x i32> %x) {
;
; SSE3-FAST-LABEL: partial_reduction_add_v8i32:
; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
; SSE3-FAST-NEXT: movd %xmm1, %eax
@@ -1689,7 +1689,7 @@ define i32 @partial_reduction_add_v8i32(<8 x i32> %x) {
;
; AVX-SLOW-LABEL: partial_reduction_add_v8i32:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1715,7 +1715,7 @@ define i32 @partial_reduction_add_v8i32(<8 x i32> %x) {
define i32 @partial_reduction_add_v16i32(<16 x i32> %x) {
; SSE3-SLOW-LABEL: partial_reduction_add_v16i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -1724,7 +1724,7 @@ define i32 @partial_reduction_add_v16i32(<16 x i32> %x) {
;
; SSE3-FAST-LABEL: partial_reduction_add_v16i32:
; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
; SSE3-FAST-NEXT: movd %xmm1, %eax
@@ -1732,7 +1732,7 @@ define i32 @partial_reduction_add_v16i32(<16 x i32> %x) {
;
; AVX-SLOW-LABEL: partial_reduction_add_v16i32:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1758,7 +1758,7 @@ define i32 @partial_reduction_add_v16i32(<16 x i32> %x) {
define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) {
; SSE3-SLOW-LABEL: partial_reduction_sub_v8i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0
@@ -1767,7 +1767,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) {
;
; SSE3-FAST-LABEL: partial_reduction_sub_v8i32:
; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-FAST-NEXT: psubd %xmm1, %xmm0
; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
@@ -1775,7 +1775,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) {
;
; AVX-SLOW-LABEL: partial_reduction_sub_v8i32:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -1785,7 +1785,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) {
;
; AVX-FAST-LABEL: partial_reduction_sub_v8i32:
; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
@@ -1802,7 +1802,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) {
define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
; SSE3-SLOW-LABEL: partial_reduction_sub_v16i32:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0
@@ -1811,7 +1811,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
;
; SSE3-FAST-LABEL: partial_reduction_sub_v16i32:
; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-FAST-NEXT: psubd %xmm1, %xmm0
; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
@@ -1819,7 +1819,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
;
; AVX-SLOW-LABEL: partial_reduction_sub_v16i32:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -1829,7 +1829,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
;
; AVX1-FAST-LABEL: partial_reduction_sub_v16i32:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
@@ -1838,7 +1838,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
;
; AVX2-FAST-LABEL: partial_reduction_sub_v16i32:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -1848,7 +1848,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
;
; AVX512-FAST-LABEL: partial_reduction_sub_v16i32:
; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -1868,7 +1868,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
define i16 @hadd16_8(<8 x i16> %x223) {
; SSE3-SLOW-LABEL: hadd16_8:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: paddw %xmm0, %xmm1
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-SLOW-NEXT: paddw %xmm1, %xmm0
@@ -1890,7 +1890,7 @@ define i16 @hadd16_8(<8 x i16> %x223) {
;
; AVX-SLOW-LABEL: hadd16_8:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -1921,7 +1921,7 @@ define i16 @hadd16_8(<8 x i16> %x223) {
define i32 @hadd32_4(<4 x i32> %x225) {
; SSE3-SLOW-LABEL: hadd32_4:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -1937,7 +1937,7 @@ define i32 @hadd32_4(<4 x i32> %x225) {
;
; AVX-SLOW-LABEL: hadd32_4:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1961,7 +1961,7 @@ define i32 @hadd32_4(<4 x i32> %x225) {
define i32 @hadd32_8(<8 x i32> %x225) {
; SSE3-SLOW-LABEL: hadd32_8:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -1970,7 +1970,7 @@ define i32 @hadd32_8(<8 x i32> %x225) {
;
; SSE3-FAST-LABEL: hadd32_8:
; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
; SSE3-FAST-NEXT: movd %xmm1, %eax
@@ -1978,7 +1978,7 @@ define i32 @hadd32_8(<8 x i32> %x225) {
;
; AVX-SLOW-LABEL: hadd32_8:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2004,7 +2004,7 @@ define i32 @hadd32_8(<8 x i32> %x225) {
define i32 @hadd32_16(<16 x i32> %x225) {
; SSE3-SLOW-LABEL: hadd32_16:
; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -2013,7 +2013,7 @@ define i32 @hadd32_16(<16 x i32> %x225) {
;
; SSE3-FAST-LABEL: hadd32_16:
; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
; SSE3-FAST-NEXT: movd %xmm1, %eax
@@ -2021,7 +2021,7 @@ define i32 @hadd32_16(<16 x i32> %x225) {
;
; AVX-SLOW-LABEL: hadd32_16:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2119,7 +2119,7 @@ define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 {
define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize {
; SSE3-LABEL: hadd32_8_optsize:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-NEXT: paddd %xmm0, %xmm1
; SSE3-NEXT: phaddd %xmm1, %xmm1
; SSE3-NEXT: movd %xmm1, %eax
@@ -2143,7 +2143,7 @@ define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize {
define i32 @hadd32_16_optsize(<16 x i32> %x225) optsize {
; SSE3-LABEL: hadd32_16_optsize:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-NEXT: paddd %xmm0, %xmm1
; SSE3-NEXT: phaddd %xmm1, %xmm1
; SSE3-NEXT: movd %xmm1, %eax
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 30f87e3d9b27..4285e7b603f8 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1083,7 +1083,7 @@ define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
;
; SSE41-LABEL: mul_v4i64_zero_upper_left:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: movdqa %xmm4, %xmm0
@@ -1252,14 +1252,14 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: pcmpgtd %xmm3, %xmm14
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1]
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
@@ -1306,7 +1306,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
@@ -1324,7 +1324,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
;
; AVX2-LABEL: mul_v8i64_sext:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 14fc182a334a..31e113f9a003 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -434,7 +434,7 @@ define <8 x i32> @mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmulhuw %xmm1, %xmm0
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -467,7 +467,7 @@ define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmulhw %xmm1, %xmm0
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -498,7 +498,7 @@ define <8 x i32> @mulhsw_v8i16_ashr(<8 x i16> %a, <8 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmulhw %xmm1, %xmm0
; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -538,9 +538,9 @@ define <16 x i32> @mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmulhuw %xmm3, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm5, %xmm1
@@ -589,9 +589,9 @@ define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmulhw %xmm3, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm5, %xmm1
@@ -639,9 +639,9 @@ define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) {
; SSE41-NEXT: pmovsxwd %xmm0, %xmm4
; SSE41-NEXT: pmulhw %xmm3, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm5, %xmm1
@@ -702,16 +702,16 @@ define <32 x i32> @mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhuw %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; SSE41-NEXT: pmulhuw %xmm5, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; SSE41-NEXT: pmulhuw %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; SSE41-NEXT: pmulhuw %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -800,16 +800,16 @@ define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhw %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; SSE41-NEXT: pmulhw %xmm5, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; SSE41-NEXT: pmulhw %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; SSE41-NEXT: pmulhw %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -901,16 +901,16 @@ define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhw %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm4, %xmm4
; SSE41-NEXT: pmulhw %xmm5, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm5, %xmm5
; SSE41-NEXT: pmulhw %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm6, %xmm6
; SSE41-NEXT: pmulhw %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm7, %xmm7
; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
@@ -1026,28 +1026,28 @@ define <64 x i32> @mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero
; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -1204,28 +1204,28 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -1386,28 +1386,28 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm8, %xmm8
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm9, %xmm9
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm10, %xmm10
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm11, %xmm11
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm12, %xmm12
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm13, %xmm13
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm14, %xmm14
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm15, %xmm15
; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
@@ -1541,7 +1541,7 @@ define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -1552,7 +1552,7 @@ define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: retq
;
@@ -1648,7 +1648,7 @@ define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -1659,7 +1659,7 @@ define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: retq
;
@@ -1775,7 +1775,7 @@ define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pmovsxwq %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwq %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm0, %xmm3
@@ -1786,7 +1786,7 @@ define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll
index 73acb76ce55f..b8ecadba81c0 100644
--- a/llvm/test/CodeGen/X86/pr15267.ll
+++ b/llvm/test/CodeGen/X86/pr15267.ll
@@ -75,7 +75,7 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind {
; CHECK-NEXT: negl %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr39733.ll b/llvm/test/CodeGen/X86/pr39733.ll
index 75f9dc51b85e..31bd5b71d0a6 100644
--- a/llvm/test/CodeGen/X86/pr39733.ll
+++ b/llvm/test/CodeGen/X86/pr39733.ll
@@ -21,7 +21,7 @@ define void @test55() {
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1
; CHECK-NEXT: # implicit-def: $ymm2
; CHECK-NEXT: vmovaps %xmm1, %xmm2
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; CHECK-NEXT: vmovdqa %ymm0, (%rsp)
diff --git a/llvm/test/CodeGen/X86/pr42452.ll b/llvm/test/CodeGen/X86/pr42452.ll
index f2f0cd2d3ce6..d3a1dad42bd3 100644
--- a/llvm/test/CodeGen/X86/pr42452.ll
+++ b/llvm/test/CodeGen/X86/pr42452.ll
@@ -8,7 +8,7 @@ define void @foo(i1 %c, <2 x i64> %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $dil killed $dil killed $edi
; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movq %xmm0, %rcx
; CHECK-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
diff --git a/llvm/test/CodeGen/X86/pr42905.ll b/llvm/test/CodeGen/X86/pr42905.ll
index 310a173f824e..6ebe5be45a4f 100644
--- a/llvm/test/CodeGen/X86/pr42905.ll
+++ b/llvm/test/CodeGen/X86/pr42905.ll
@@ -7,7 +7,7 @@ define <4 x double> @autogen_SD30452(i1 %L230) {
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [151829,151829]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; CHECK-NEXT: movq %xmm2, %rax
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: cvtsi2sd %rax, %xmm2
diff --git a/llvm/test/CodeGen/X86/pr44976.ll b/llvm/test/CodeGen/X86/pr44976.ll
index 871937d29550..72ced2b1fa0f 100644
--- a/llvm/test/CodeGen/X86/pr44976.ll
+++ b/llvm/test/CodeGen/X86/pr44976.ll
@@ -57,7 +57,7 @@ define <3 x i32> @f_29(<12 x i16> %a, <12 x i16> %b) {
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,1]
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,2,3]
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; CHECK-NEXT: paddd %xmm3, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll
index 681e06ada65d..36b2de07bcc8 100644
--- a/llvm/test/CodeGen/X86/pr45378.ll
+++ b/llvm/test/CodeGen/X86/pr45378.ll
@@ -77,7 +77,7 @@ define i1 @parseHeaders2_scalar_and(i64 * %ptr) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: testq %rcx, %rax
; SSE2-NEXT: sete %al
diff --git a/llvm/test/CodeGen/X86/pr46189.ll b/llvm/test/CodeGen/X86/pr46189.ll
index 558483754c68..97190c10ef7c 100644
--- a/llvm/test/CodeGen/X86/pr46189.ll
+++ b/llvm/test/CodeGen/X86/pr46189.ll
@@ -21,7 +21,7 @@ define { i64, i64 } @PR46189(double %0, double %1, double %2, double %3, double
; SSE-NEXT: cvttpd2dq %xmm3, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movq %xmm0, %rdx
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/pr46455.ll b/llvm/test/CodeGen/X86/pr46455.ll
index e5ed94aa5493..c232d548faef 100644
--- a/llvm/test/CodeGen/X86/pr46455.ll
+++ b/llvm/test/CodeGen/X86/pr46455.ll
@@ -10,7 +10,7 @@ define void @EntryModule(i8** %buffer_table) {
; CHECK-NEXT: vcmpneqps (%rax), %ymm0, %ymm0
; CHECK-NEXT: vpsrld $31, %xmm0, %xmm1
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; CHECK-NEXT: vpsubd %xmm0, %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index 0706254f4e5c..bdbc56381686 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -135,7 +135,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX256VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2
; AVX256VL-NEXT: vpmovsxbd %xmm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 28ab2e1968db..d785b10f9c32 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -522,7 +522,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: test13:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm5, %xmm3
@@ -697,7 +697,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
@@ -735,9 +735,9 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT: vpmaxud %xmm6, %xmm7, %xmm6
@@ -772,7 +772,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
;
; AVX2-LABEL: test14:
; AVX2: # %bb.0: # %vector.ph
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpmaxud %ymm4, %ymm1, %ymm4
@@ -873,7 +873,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: test15:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm4, %xmm5
@@ -1005,7 +1005,7 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: test16:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmaxud %xmm1, %xmm4
@@ -1871,10 +1871,10 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
;
; SSE41-LABEL: psubus_16i32_max:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmaxud %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index f55a58048e22..a197e795754a 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -27,7 +27,7 @@ define i32 @sad_16i8() nounwind {
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -54,7 +54,7 @@ define i32 @sad_16i8() nounwind {
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -79,7 +79,7 @@ define i32 @sad_16i8() nounwind {
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -104,7 +104,7 @@ define i32 @sad_16i8() nounwind {
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -172,7 +172,7 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -207,7 +207,7 @@ define i32 @sad_32i8() nounwind {
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -234,7 +234,7 @@ define i32 @sad_32i8() nounwind {
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -261,7 +261,7 @@ define i32 @sad_32i8() nounwind {
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -346,7 +346,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -397,7 +397,7 @@ define i32 @sad_avx64i8() nounwind {
; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -431,7 +431,7 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -463,7 +463,7 @@ define i32 @sad_avx64i8() nounwind {
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -492,7 +492,7 @@ define i32 @sad_avx64i8() nounwind {
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -628,7 +628,7 @@ define i32 @sad_4i8() nounwind {
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB4_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -649,7 +649,7 @@ define i32 @sad_4i8() nounwind {
; AVX-NEXT: addq $4, %rax
; AVX-NEXT: jne .LBB4_1
; AVX-NEXT: # %bb.2: # %middle.block
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -760,7 +760,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rdx), %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
@@ -769,7 +769,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n
; AVX: # %bb.0:
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
@@ -803,7 +803,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; SSE2-NEXT: movdqu 16(%rdi), %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
@@ -815,7 +815,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1
; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
@@ -826,7 +826,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
@@ -838,7 +838,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
@@ -883,7 +883,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n
; SSE2-NEXT: paddq %xmm0, %xmm2
; SSE2-NEXT: paddq %xmm1, %xmm2
; SSE2-NEXT: paddq %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
@@ -901,7 +901,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
@@ -915,7 +915,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
@@ -931,7 +931,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n
; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
@@ -945,7 +945,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n
; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
@@ -987,7 +987,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -1002,7 +1002,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1051,7 +1051,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
; SSE2-NEXT: movdqu (%rcx), %xmm2
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -1065,7 +1065,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index 02400d28c0e2..3b3a1b57ecd0 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -443,13 +443,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pcmpgtd %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movdqa %xmm1, %xmm4
; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; X64-NEXT: movq %xmm4, %rcx
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pcmpgtd %xmm0, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: psllq $31, %xmm0
; X64-NEXT: movq %xmm0, %rax
@@ -457,9 +457,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: idivq %rcx
; X64-NEXT: movq %rax, %r8
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
; X64-NEXT: movq %xmm2, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm2, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rcx
@@ -478,9 +478,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: idivq %rdi
; X64-NEXT: movq %rax, %r9
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; X64-NEXT: movq %xmm2, %rsi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm2, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rsi
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index e5348b9febb4..512488e8f872 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -651,14 +651,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: cmoveq %rax, %r13
; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rbx
; X64-NEXT: movq %rbx, %r13
; X64-NEXT: sarq $63, %r13
; X64-NEXT: shldq $31, %rbx, %r13
-; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %rbp
@@ -709,8 +709,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: psrlq $1, %xmm1
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; X64-NEXT: # xmm1 = mem[2,3,0,1]
+; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; X64-NEXT: # xmm1 = mem[2,3,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -720,8 +720,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movq %rbx, %r12
; X64-NEXT: sarq $63, %r12
; X64-NEXT: shldq $31, %rbx, %r12
-; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; X64-NEXT: # xmm1 = mem[2,3,0,1]
+; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; X64-NEXT: # xmm1 = mem[2,3,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -773,14 +773,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: cmoveq %rax, %r13
; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rbx
; X64-NEXT: movq %rbx, %r13
; X64-NEXT: sarq $63, %r13
; X64-NEXT: shldq $31, %rbx, %r13
-; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %rbp
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 74a6ae58f9f9..8a788f41d5cc 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -75,16 +75,16 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) {
define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: ne_i256:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm4, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm4, %rcx
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: movq %xmm1, %r8
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: xorq %rax, %rdi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rsi
; SSE2-NEXT: xorq %rcx, %rsi
; SSE2-NEXT: orq %rdi, %rsi
@@ -155,16 +155,16 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: eq_i256:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm4, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm4, %rcx
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: movq %xmm1, %r8
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: xorq %rax, %rdi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rsi
; SSE2-NEXT: xorq %rcx, %rsi
; SSE2-NEXT: orq %rdi, %rsi
@@ -235,28 +235,28 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE2-LABEL: ne_i512:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rsi
; SSE2-NEXT: movq %xmm0, %r11
; SSE2-NEXT: movq %xmm2, %r8
; SSE2-NEXT: movq %xmm1, %r9
; SSE2-NEXT: movq %xmm3, %r10
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: xorq %rax, %rdi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorq %rcx, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: xorq %rdx, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: xorq %rsi, %rdx
; SSE2-NEXT: orq %rcx, %rdx
@@ -426,28 +426,28 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE2-LABEL: eq_i512:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rsi
; SSE2-NEXT: movq %xmm0, %r11
; SSE2-NEXT: movq %xmm2, %r8
; SSE2-NEXT: movq %xmm1, %r9
; SSE2-NEXT: movq %xmm3, %r10
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: xorq %rax, %rdi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorq %rcx, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: xorq %rdx, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: xorq %rsi, %rdx
; SSE2-NEXT: orq %rcx, %rdx
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 6a5fab8469fa..17cfc5326690 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -2094,9 +2094,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl %esi
; X86-SSE-NEXT: movd %edx, %xmm3
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
; X86-SSE-NEXT: movd %xmm7, %eax
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,0,1]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
; X86-SSE-NEXT: movd %xmm7, %esi
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl %esi
@@ -2137,9 +2137,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl %esi
; X86-SSE-NEXT: movd %edx, %xmm4
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X86-SSE-NEXT: movd %xmm2, %eax
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X86-SSE-NEXT: movd %xmm1, %esi
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl %esi
@@ -2336,9 +2336,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm3
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
; X64-SSE-NEXT: movd %xmm7, %eax
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,0,1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
; X64-SSE-NEXT: movd %xmm7, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
@@ -2379,9 +2379,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
; X64-SSE-NEXT: movd %edx, %xmm4
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X64-SSE-NEXT: movd %xmm2, %eax
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-SSE-NEXT: movd %xmm1, %ecx
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %ecx
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 0b79b62f84a1..53deafc9a4b4 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -224,7 +224,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SLM32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT: movdqa %xmm1, %xmm4
; SLM32-NEXT: movdqa %xmm3, %xmm5
@@ -244,7 +244,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SLM64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT: movdqa %xmm1, %xmm4
; SLM64-NEXT: movdqa %xmm3, %xmm5
@@ -270,7 +270,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLOW32-NEXT: movdqa %xmm1, %xmm4
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT: movdqa %xmm3, %xmm0
; SLOW32-NEXT: pmulhw %xmm2, %xmm0
@@ -291,7 +291,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLOW64-NEXT: movdqa %xmm1, %xmm4
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SLOW64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT: movdqa %xmm3, %xmm0
; SLOW64-NEXT: pmulhw %xmm2, %xmm0
@@ -306,7 +306,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -322,7 +322,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -336,7 +336,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
@@ -346,7 +346,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
@@ -494,7 +494,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
@@ -504,7 +504,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
@@ -609,9 +609,9 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -625,9 +625,9 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -880,7 +880,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
@@ -896,7 +896,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
@@ -910,7 +910,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -926,7 +926,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -942,7 +942,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -958,7 +958,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -972,7 +972,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
@@ -982,7 +982,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
@@ -1077,7 +1077,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: pmulld %xmm2, %xmm0
@@ -1087,7 +1087,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLM64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: pmulld %xmm2, %xmm0
@@ -1096,7 +1096,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
;
; SLOW32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
@@ -1106,7 +1106,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
;
; SLOW64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
@@ -1116,7 +1116,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
@@ -1126,7 +1126,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
@@ -1155,9 +1155,9 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM32: # %bb.0:
-; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -1171,9 +1171,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
;
; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM64: # %bb.0:
-; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -1187,9 +1187,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
;
; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1203,9 +1203,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
;
; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1219,9 +1219,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1235,9 +1235,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 284f51d7422e..e0c1b762c150 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -206,10 +206,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
; X64-NEXT: cmovll %ecx, %edx
; X64-NEXT: movd %edx, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movd %xmm3, %edx
; X64-NEXT: movslq %edx, %rdx
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movd %xmm3, %esi
; X64-NEXT: movslq %esi, %rsi
; X64-NEXT: imulq %rdx, %rsi
@@ -476,9 +476,9 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF
; X64-NEXT: imull %edx, %ecx
; X64-NEXT: cmovol %edi, %ecx
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X64-NEXT: movd %xmm2, %edx
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT: movd %xmm2, %esi
; X64-NEXT: movl %esi, %edi
; X64-NEXT: imull %edx, %edi
diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
index 7f0e2cfa44b0..0abac9209905 100644
--- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
+++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
@@ -5,7 +5,7 @@
define <4 x i64> @autogen_SD88863() {
; CHECK-LABEL: autogen_SD88863:
; CHECK: # %bb.0: # %BB
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
diff --git a/llvm/test/CodeGen/X86/split-vector-rem.ll b/llvm/test/CodeGen/X86/split-vector-rem.ll
index ef03075ac65d..959c9bd0ff76 100644
--- a/llvm/test/CodeGen/X86/split-vector-rem.ll
+++ b/llvm/test/CodeGen/X86/split-vector-rem.ll
@@ -12,9 +12,9 @@ define <8 x i32> @foo(<8 x i32> %t, <8 x i32> %u) {
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
; CHECK-NEXT: movd %edx, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; CHECK-NEXT: movd %xmm5, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; CHECK-NEXT: movd %xmm5, %ecx
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
@@ -41,9 +41,9 @@ define <8 x i32> @foo(<8 x i32> %t, <8 x i32> %u) {
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; CHECK-NEXT: movd %xmm4, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; CHECK-NEXT: movd %xmm4, %ecx
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
@@ -79,9 +79,9 @@ define <8 x i32> @bar(<8 x i32> %t, <8 x i32> %u) {
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
; CHECK-NEXT: movd %edx, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; CHECK-NEXT: movd %xmm5, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; CHECK-NEXT: movd %xmm5, %ecx
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
@@ -108,9 +108,9 @@ define <8 x i32> @bar(<8 x i32> %t, <8 x i32> %u) {
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; CHECK-NEXT: movd %xmm4, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; CHECK-NEXT: movd %xmm4, %ecx
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 85400656e2e5..206814cfcf1c 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -2791,8 +2791,8 @@ define void @test_mm_storeh_pi(x86_mmx *%a0, <4 x float> %a1) nounwind {
;
; X64-SSE2-LABEL: test_mm_storeh_pi:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pshufd $78, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x4e]
-; X64-SSE2-NEXT: # xmm0 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: punpckhqdq %xmm0, %xmm0 # encoding: [0x66,0x0f,0x6d,0xc0]
+; X64-SSE2-NEXT: # xmm0 = xmm0[1,1]
; X64-SSE2-NEXT: movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0]
; X64-SSE2-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-SSE2-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 460987cd74df..e2973ebbab89 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -692,24 +692,24 @@ entry:
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: insertps_from_shufflevector_i32_2:
; SSE: ## %bb.0: ## %entry
-; SSE-NEXT: pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e]
-; SSE-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
+; SSE-NEXT: ## xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pblendw $12, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x0c]
; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_from_shufflevector_i32_2:
; AVX1: ## %bb.0: ## %entry
-; AVX1-NEXT: vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; AVX1-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_from_shufflevector_i32_2:
; AVX512: ## %bb.0: ## %entry
-; AVX512-NEXT: vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; AVX512-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3]
; AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
@@ -1875,8 +1875,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
; X86-SSE-LABEL: insertps_pr20411:
; X86-SSE: ## %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e]
-; X86-SSE-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X86-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
+; X86-SSE-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X86-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
; X86-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X86-SSE-NEXT: movdqu %xmm1, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x08]
@@ -1885,8 +1885,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
; X86-AVX1-LABEL: insertps_pr20411:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X86-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X86-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00]
@@ -1895,8 +1895,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
; X86-AVX512-LABEL: insertps_pr20411:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X86-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X86-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X86-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
@@ -1904,8 +1904,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
;
; X64-SSE-LABEL: insertps_pr20411:
; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e]
-; X64-SSE-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X64-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
+; X64-SSE-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X64-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
; X64-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-SSE-NEXT: movdqu %xmm1, (%rdi) ## encoding: [0xf3,0x0f,0x7f,0x0f]
@@ -1913,8 +1913,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
;
; X64-AVX1-LABEL: insertps_pr20411:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X64-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X64-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07]
@@ -1922,8 +1922,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
;
; X64-AVX512-LABEL: insertps_pr20411:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X64-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X64-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X64-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll
index 05b2b6608add..4c7acf60d308 100644
--- a/llvm/test/CodeGen/X86/trunc-subvector.ll
+++ b/llvm/test/CodeGen/X86/trunc-subvector.ll
@@ -73,7 +73,7 @@ define <2 x i32> @test4(<8 x i32> %v) {
define <2 x i32> @test5(<8 x i32> %v) {
; SSE2-LABEL: test5:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
@@ -175,7 +175,7 @@ define <2 x i32> @test9(<8 x i32> %v) {
define <2 x i32> @test10(<8 x i32> %v) {
; SSE2-LABEL: test10:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll
index 7540b394babd..e42e527553f1 100644
--- a/llvm/test/CodeGen/X86/udiv_fix.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix.ll
@@ -248,9 +248,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; X64-NEXT: movq %xmm4, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
; X64-NEXT: movq %xmm4, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
@@ -264,9 +264,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm1, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index 493515c418ca..f6c8baf2e923 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -335,9 +335,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm7
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X64-NEXT: movq %xmm2, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
; X64-NEXT: movq %xmm2, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
@@ -369,9 +369,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm1, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
diff --git a/llvm/test/CodeGen/X86/uint_to_fp-3.ll b/llvm/test/CodeGen/X86/uint_to_fp-3.ll
index 9efd9a5bef5f..ca46b48b7731 100644
--- a/llvm/test/CodeGen/X86/uint_to_fp-3.ll
+++ b/llvm/test/CodeGen/X86/uint_to_fp-3.ll
@@ -40,7 +40,7 @@ define <4 x double> @mask_ucvt_4i32_4f64(<4 x i32> %a) {
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
@@ -55,7 +55,7 @@ define <4 x double> @mask_ucvt_4i32_4f64(<4 x i32> %a) {
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; X64-SSE-NEXT: movaps %xmm2, %xmm0
; X64-SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index 18b769f32ec9..ce744f93cdfe 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -151,9 +151,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movl $-1, %eax
; X64-NEXT: cmoval %eax, %ecx
; X64-NEXT: movd %ecx, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movd %xmm3, %ecx
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movd %xmm3, %edx
; X64-NEXT: imulq %rcx, %rdx
; X64-NEXT: movq %rdx, %rcx
@@ -361,9 +361,9 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: cmovol %ecx, %eax
; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movd %xmm3, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movd %xmm3, %edx
; X64-NEXT: mull %edx
; X64-NEXT: cmovol %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 9fdb08ba6d4e..923aaa34f04d 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -2411,7 +2411,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@@ -2516,7 +2516,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,1]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 8bc971e79f50..0a057852613a 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -16,7 +16,7 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3: # %bb.0:
; SSE3-NEXT: movq %xmm1, %rax
; SSE3-NEXT: andl $1, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -29,7 +29,7 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %xmm1, %rax
; SSSE3-NEXT: andl $1, %eax
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -69,7 +69,7 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm1, %esi
@@ -379,7 +379,7 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
; SSE3: # %bb.0:
; SSE3-NEXT: movq %xmm1, %rax
; SSE3-NEXT: andl $1, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -391,7 +391,7 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %xmm1, %rax
; SSSE3-NEXT: andl $1, %eax
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -430,7 +430,7 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm1, %esi
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
index 04692995b7e9..94f7d7eeaf39 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -135,7 +135,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fstps (%esp)
@@ -154,7 +154,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE-64: # %bb.0:
; SSE-64-NEXT: movq %xmm0, %rax
; SSE-64-NEXT: cvtsi2ss %rax, %xmm1
-; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-64-NEXT: movq %xmm0, %rax
; SSE-64-NEXT: xorps %xmm0, %xmm0
; SSE-64-NEXT: cvtsi2ss %rax, %xmm0
@@ -172,7 +172,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE41-32-NEXT: andl $-8, %esp
; SSE41-32-NEXT: subl $24, %esp
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fstps (%esp)
@@ -191,7 +191,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE41-64: # %bb.0:
; SSE41-64-NEXT: movq %xmm0, %rax
; SSE41-64-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-64-NEXT: movq %xmm0, %rax
; SSE41-64-NEXT: xorps %xmm0, %xmm0
; SSE41-64-NEXT: cvtsi2ss %rax, %xmm0
@@ -209,7 +209,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $24, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fstps (%esp)
@@ -236,7 +236,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm1
-; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX512DQ-32-NEXT: vzeroupper
@@ -271,7 +271,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE-32-NEXT: .cfi_def_cfa_register %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
@@ -313,7 +313,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE-64-NEXT: # %bb.1:
; SSE-64-NEXT: addss %xmm0, %xmm0
; SSE-64-NEXT: .LBB3_2:
-; SSE-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-64-NEXT: movq %xmm1, %rax
; SSE-64-NEXT: movq %rax, %rcx
; SSE-64-NEXT: shrq %rcx
@@ -340,7 +340,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE41-32-NEXT: .cfi_def_cfa_register %ebp
; SSE41-32-NEXT: andl $-8, %esp
; SSE41-32-NEXT: subl $24, %esp
-; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
@@ -382,7 +382,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE41-64-NEXT: # %bb.1:
; SSE41-64-NEXT: addss %xmm0, %xmm0
; SSE41-64-NEXT: .LBB3_2:
-; SSE41-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-64-NEXT: movq %xmm1, %rax
; SSE41-64-NEXT: movq %rax, %rcx
; SSE41-64-NEXT: shrq %rcx
@@ -410,7 +410,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $24, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractps $1, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
@@ -471,7 +471,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm1
-; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX512DQ-32-NEXT: vzeroupper
@@ -1146,7 +1146,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $32, %esp
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fstpl {{[0-9]+}}(%esp)
@@ -1164,7 +1164,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE-64: # %bb.0:
; SSE-64-NEXT: movq %xmm0, %rax
; SSE-64-NEXT: cvtsi2sd %rax, %xmm1
-; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-64-NEXT: movq %xmm0, %rax
; SSE-64-NEXT: xorps %xmm0, %xmm0
; SSE-64-NEXT: cvtsi2sd %rax, %xmm0
@@ -1182,7 +1182,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE41-32-NEXT: andl $-8, %esp
; SSE41-32-NEXT: subl $32, %esp
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fstpl {{[0-9]+}}(%esp)
@@ -1200,7 +1200,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE41-64: # %bb.0:
; SSE41-64-NEXT: movq %xmm0, %rax
; SSE41-64-NEXT: cvtsi2sd %rax, %xmm1
-; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-64-NEXT: movq %xmm0, %rax
; SSE41-64-NEXT: xorps %xmm0, %xmm0
; SSE41-64-NEXT: cvtsi2sd %rax, %xmm0
@@ -1218,7 +1218,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $32, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index d97787e36cd8..a5519e68f73a 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -281,7 +281,7 @@ define <8 x float> @sitofp_v8i16_v8f32(<8 x i16> %x) #0 {
; AVX1-LABEL: sitofp_v8i16_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll
index ed703f170847..c979050fc180 100644
--- a/llvm/test/CodeGen/X86/vec_cast2.ll
+++ b/llvm/test/CodeGen/X86/vec_cast2.ll
@@ -18,7 +18,7 @@ define <8 x float> @cvt_v8i16_v8f32(<8 x i16> %src) {
; CHECK-LABEL: cvt_v8i16_v8f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 6cb352c3f348..a413752993b5 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -95,7 +95,7 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
@@ -305,14 +305,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
@@ -421,7 +421,7 @@ define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f64:
; SSE: # %bb.0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
@@ -440,7 +440,7 @@ define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -448,7 +448,7 @@ define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -468,7 +468,7 @@ define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -476,7 +476,7 @@ define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -504,7 +504,7 @@ define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -512,7 +512,7 @@ define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -533,7 +533,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -541,7 +541,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1015,7 +1015,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
; SSE41-NEXT: por %xmm3, %xmm2
; SSE41-NEXT: subpd %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: subpd %xmm3, %xmm1
@@ -1074,7 +1074,7 @@ define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1083,7 +1083,7 @@ define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1103,7 +1103,7 @@ define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1112,7 +1112,7 @@ define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1140,7 +1140,7 @@ define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1149,7 +1149,7 @@ define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1170,7 +1170,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1179,7 +1179,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1209,7 +1209,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -1274,7 +1274,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
; SSE2-LABEL: sitofp_2i64_to_4f32_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -1345,7 +1345,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -1464,7 +1464,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX1-LABEL: sitofp_8i16_to_4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -1568,7 +1568,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -1576,7 +1576,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -1719,7 +1719,7 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm2, %xmm0
@@ -1728,7 +1728,7 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; AVX1-LABEL: sitofp_8i16_to_8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -1868,7 +1868,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB41_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB41_4
@@ -1969,7 +1969,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; SSE2-LABEL: uitofp_2i64_to_2f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB42_1
@@ -2102,7 +2102,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB43_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB43_4
@@ -2462,7 +2462,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
; SSE2-NEXT: addss %xmm2, %xmm2
; SSE2-NEXT: .LBB49_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB49_4
@@ -2494,7 +2494,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB49_9:
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB49_10
@@ -2769,7 +2769,7 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm2, %xmm0
@@ -2906,7 +2906,7 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
@@ -3103,7 +3103,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
@@ -3111,7 +3111,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: cvtsi2sd %rax, %xmm2
@@ -3209,7 +3209,7 @@ define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
@@ -3229,7 +3229,7 @@ define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -3237,7 +3237,7 @@ define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd (%rdi), %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -3259,7 +3259,7 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -3267,7 +3267,7 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd (%rdi), %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -3770,7 +3770,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: subpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: subpd %xmm2, %xmm1
@@ -3831,7 +3831,7 @@ define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -3839,7 +3839,7 @@ define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -3861,7 +3861,7 @@ define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -3869,7 +3869,7 @@ define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -3894,7 +3894,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -3902,7 +3902,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -4073,7 +4073,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -4081,7 +4081,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -4090,7 +4090,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: xorps %xmm4, %xmm4
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -4098,7 +4098,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
@@ -4378,7 +4378,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB83_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB83_4
@@ -4410,7 +4410,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB83_9:
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB83_10
@@ -4729,7 +4729,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: addss %xmm3, %xmm3
; SSE2-NEXT: .LBB87_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_4
@@ -4760,7 +4760,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB87_9:
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm5, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_10
@@ -4791,7 +4791,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm5
; SSE2-NEXT: addss %xmm5, %xmm5
; SSE2-NEXT: .LBB87_15:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_16
@@ -4826,7 +4826,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: .LBB87_21:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_22
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index dd3a733ab217..4f071c064e5c 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -107,7 +107,7 @@ define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: retq
;
@@ -119,7 +119,7 @@ define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movq %xmm1, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, 8(%rdi)
; SSSE3-NEXT: retq
;
@@ -512,13 +512,13 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -644,7 +644,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
@@ -667,9 +667,9 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovdqa %xmm3, (%rdi)
@@ -683,7 +683,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
; AVX2-NEXT: retq
@@ -769,7 +769,7 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -889,7 +889,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: movw %cx, 9(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm1, %edx
; SSE2-NEXT: movw %dx, 6(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
@@ -924,7 +924,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSSE3-NEXT: movd %xmm1, %ecx
; SSSE3-NEXT: movw %cx, 9(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, %edx
; SSSE3-NEXT: movw %dx, 6(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 5fde07d1269d..b5fefe296d77 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -1193,13 +1193,13 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpacksswb %xmm1, %xmm5, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
+; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
-; AVX1-NEXT: vpacksswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -1376,11 +1376,11 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pmovsxbw %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm4
@@ -1407,7 +1407,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
@@ -1436,9 +1436,9 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm5
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -1450,9 +1450,9 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -1478,7 +1478,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
; AVX2-NEXT: retq
@@ -1730,11 +1730,11 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: pmovsxbw %xmm3, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmovsxbw %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
@@ -1754,12 +1754,12 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: pcmpeqb %xmm1, %xmm4
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: pmovsxbw %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; SSE41-NEXT: movdqa %xmm2, %xmm7
; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm7, %xmm0
; SSE41-NEXT: pmullw %xmm10, %xmm1
@@ -1783,7 +1783,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm8
; SSE41-NEXT: psrad $31, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm5
; SSE41-NEXT: psrad $31, %xmm5
@@ -1795,7 +1795,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm7
; SSE41-NEXT: psrad $31, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
@@ -1841,9 +1841,9 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm5, %xmm4
; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
@@ -1865,9 +1865,9 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm7
; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -1882,14 +1882,14 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vmovdqa %xmm9, 16(%rdi)
@@ -1929,9 +1929,9 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
; AVX2-NEXT: retq
@@ -2411,8 +2411,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pand %xmm10, %xmm9
; SSE41-NEXT: packuswb %xmm11, %xmm9
; SSE41-NEXT: pmovsxbw %xmm3, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,3,2,3]
; SSE41-NEXT: pmullw %xmm12, %xmm8
; SSE41-NEXT: pxor %xmm7, %xmm7
; SSE41-NEXT: pcmpgtb %xmm9, %xmm7
@@ -2436,8 +2436,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pand %xmm10, %xmm12
; SSE41-NEXT: packuswb %xmm3, %xmm12
; SSE41-NEXT: pmovsxbw %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT: pmullw %xmm7, %xmm3
; SSE41-NEXT: pxor %xmm7, %xmm7
; SSE41-NEXT: pcmpgtb %xmm12, %xmm7
@@ -2461,8 +2461,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pand %xmm10, %xmm11
; SSE41-NEXT: packuswb %xmm7, %xmm11
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: pmullw %xmm6, %xmm2
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: pcmpgtb %xmm11, %xmm6
@@ -2487,8 +2487,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: packuswb %xmm6, %xmm5
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: pmullw %xmm7, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm4, %xmm0
@@ -2523,7 +2523,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, (%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2538,7 +2538,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, 208(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2553,7 +2553,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, 144(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2568,7 +2568,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, 80(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2606,9 +2606,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm6, %xmm7
; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6
; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
@@ -2628,9 +2628,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm7
; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
@@ -2652,9 +2652,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm11, %xmm7
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6
; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
@@ -2672,9 +2672,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm5
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm6
; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm2
@@ -2699,37 +2699,37 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vmovdqa %xmm1, 64(%rdi)
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 240(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 208(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 160(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 176(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 144(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 96(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 112(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 80(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -2794,15 +2794,15 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX2-NEXT: vpackuswb %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm7, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm2, %ymm8
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5
@@ -2932,7 +2932,7 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -2971,9 +2971,9 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE2-LABEL: smulo_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %r8
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rcx
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: movq %xmm0, %rsi
@@ -2996,9 +2996,9 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
;
; SSSE3-LABEL: smulo_v2i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm2, %r8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT: movq %xmm2, %rcx
; SSSE3-NEXT: movq %xmm1, %rdx
; SSSE3-NEXT: movq %xmm0, %rsi
@@ -3158,7 +3158,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm0, %ecx
; SSE2-NEXT: movw %cx, 6(%rdi)
; SSE2-NEXT: movd %xmm2, %edx
@@ -3213,7 +3213,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSSE3-NEXT: movd %xmm0, %ecx
; SSSE3-NEXT: movw %cx, 6(%rdi)
; SSSE3-NEXT: movd %xmm2, %edx
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 8ab9367c32f8..eb12f0dbcbff 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -109,7 +109,7 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movq %xmm3, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: retq
;
@@ -122,7 +122,7 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movq %xmm3, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, 8(%rdi)
; SSSE3-NEXT: retq
;
@@ -517,13 +517,13 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -649,7 +649,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
@@ -672,9 +672,9 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovdqa %xmm3, (%rdi)
@@ -688,7 +688,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
; AVX2-NEXT: retq
@@ -774,7 +774,7 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -899,7 +899,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: movw %cx, 9(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm1, %edx
; SSE2-NEXT: movw %dx, 6(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
@@ -934,7 +934,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSSE3-NEXT: movd %xmm1, %ecx
; SSSE3-NEXT: movw %cx, 9(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, %edx
; SSSE3-NEXT: movw %dx, 6(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 864f0b59f453..9ff793b6b677 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -120,7 +120,7 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: retq
;
@@ -132,7 +132,7 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
; SSSE3-NEXT: movq %xmm1, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, 8(%rdi)
; SSSE3-NEXT: retq
;
@@ -601,13 +601,13 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6
-; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -727,7 +727,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm4
; SSE41-NEXT: psrad $31, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
@@ -750,9 +750,9 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -766,7 +766,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
@@ -850,7 +850,7 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -956,7 +956,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: movw %cx, 9(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm1, %edx
; SSE2-NEXT: movw %dx, 6(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
@@ -988,7 +988,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSSE3-NEXT: movd %xmm1, %ecx
; SSSE3-NEXT: movw %cx, 9(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, %edx
; SSSE3-NEXT: movw %dx, 6(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index e08bbc363721..87fe4922dfcb 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -1008,16 +1008,16 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5
; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5
-; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm7
+; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6
-; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpacksswb %xmm5, %xmm11, %xmm1
+; AVX1-NEXT: vpacksswb %xmm11, %xmm11, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -1217,7 +1217,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
@@ -1254,9 +1254,9 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -1278,7 +1278,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
@@ -1560,7 +1560,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
@@ -1572,7 +1572,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm4
; SSE41-NEXT: psrad $31, %xmm4
@@ -1647,14 +1647,14 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi)
@@ -1689,9 +1689,9 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
; AVX2-NEXT: retq
@@ -2230,7 +2230,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, (%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2245,7 +2245,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, 208(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2260,7 +2260,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, 144(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2275,7 +2275,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, 80(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2390,37 +2390,37 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vmovdqa %xmm4, 64(%rdi)
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT: vmovdqa %xmm4, (%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vmovdqa %xmm4, 224(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vmovdqa %xmm4, 240(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vmovdqa %xmm3, 208(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vmovdqa %xmm3, 160(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vmovdqa %xmm3, 176(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vmovdqa %xmm2, 144(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vmovdqa %xmm2, 96(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vmovdqa %xmm2, 112(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 80(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -2475,15 +2475,15 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
@@ -2608,7 +2608,7 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -2646,9 +2646,9 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE2-LABEL: umulo_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %r8
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %r10
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movq %xmm1, %rdx
@@ -2672,9 +2672,9 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
;
; SSSE3-LABEL: umulo_v2i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT: movq %xmm2, %r8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm2, %r10
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: movq %xmm1, %rdx
@@ -2829,7 +2829,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: movw %ax, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm2, %ecx
; SSE2-NEXT: movw %cx, 6(%rdi)
; SSE2-NEXT: movd %xmm1, %edx
@@ -2873,7 +2873,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: movd %xmm2, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm2, %ecx
; SSSE3-NEXT: movw %cx, 6(%rdi)
; SSSE3-NEXT: movd %xmm1, %edx
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 5302b3f8913d..155c5591ce11 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -126,7 +126,7 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -140,7 +140,7 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSSE3-NEXT: movd %xmm0, 8(%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
@@ -644,13 +644,13 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6
-; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -771,7 +771,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
@@ -793,9 +793,9 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -809,7 +809,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
@@ -895,7 +895,7 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -1003,7 +1003,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: movw %cx, 9(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm1, %edx
; SSE2-NEXT: movw %dx, 6(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
@@ -1035,7 +1035,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSSE3-NEXT: movd %xmm1, %ecx
; SSSE3-NEXT: movw %cx, 9(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, %edx
; SSSE3-NEXT: movw %dx, 6(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 22e97e740753..62bc377b9cec 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -6296,7 +6296,7 @@ define <2 x double> @constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: cvtsi2sd %rax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
@@ -6342,7 +6342,7 @@ define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %rax, %xmm0
@@ -6375,7 +6375,7 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 {
; CHECK-NEXT: movd %xmm1, %eax
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2sd %eax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %eax, %xmm0
@@ -6414,7 +6414,7 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 {
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: cvtsi2ss %eax, %xmm2
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %eax, %xmm0
@@ -6535,7 +6535,7 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i32(<4 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: retq
@@ -6575,14 +6575,14 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: cvtsi2sd %rax, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: cvtsi2sd %rax, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
@@ -6642,7 +6642,7 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: cvtsi2ss %rax, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rax, %xmm1
@@ -6650,7 +6650,7 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %rax, %xmm0
@@ -6970,7 +6970,7 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 {
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addss %xmm0, %xmm0
; CHECK-NEXT: .LBB174_2: # %entry
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
@@ -7031,7 +7031,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 {
; CHECK-NEXT: movd %xmm1, %eax
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2sd %rax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
@@ -7082,7 +7082,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 {
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: cvtsi2ss %rax, %xmm2
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %rax, %xmm0
@@ -7157,7 +7157,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0]
@@ -7458,7 +7458,7 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addss %xmm2, %xmm2
; CHECK-NEXT: .LBB182_2: # %entry
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
@@ -7487,7 +7487,7 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
; CHECK-NEXT: addss %xmm1, %xmm1
; CHECK-NEXT: .LBB182_6: # %entry
; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index d8442048f65e..0192d1e8137c 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -30,7 +30,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllq %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllq %xmm4, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
@@ -38,7 +38,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE2-NEXT: psubq %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlq %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE2-NEXT: psrlq %xmm3, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT: orpd %xmm5, %xmm1
@@ -56,7 +56,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllq %xmm5, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm4[4,5,6,7]
@@ -64,7 +64,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE41-NEXT: psubq %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlq %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: psrlq %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: por %xmm1, %xmm4
@@ -78,13 +78,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
@@ -212,7 +212,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psllq %xmm4, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
@@ -220,7 +220,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; X32-SSE-NEXT: psubq %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: psrlq %xmm3, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; X32-SSE-NEXT: psrlq %xmm3, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm1
@@ -249,7 +249,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrld %xmm6, %xmm3
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psrld %xmm5, %xmm6
@@ -285,7 +285,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm7
; SSE41-NEXT: psrld %xmm6, %xmm7
@@ -465,7 +465,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrld %xmm6, %xmm3
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm6
; X32-SSE-NEXT: psrld %xmm5, %xmm6
@@ -1366,7 +1366,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; X32-SSE-NEXT: psubq %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrlq %xmm4, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; X32-SSE-NEXT: psrlq %xmm4, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 0688107ed5c0..0cf4c172412a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -26,11 +26,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
@@ -38,12 +38,12 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX1-NEXT: vpsubq %xmm4, %xmm8, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vpsubq %xmm2, %xmm8, %xmm6
; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index c560f99916be..59bef3a97b1f 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -31,14 +31,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllq %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllq %xmm1, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE2-NEXT: psrlq %xmm2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: orpd %xmm5, %xmm0
@@ -52,14 +52,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psllq %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psllq %xmm1, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: psrlq %xmm2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm0
@@ -70,14 +70,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
@@ -136,14 +136,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psllq %xmm1, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; X32-SSE-NEXT: psrlq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
@@ -745,14 +745,14 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psllq %xmm1, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; X32-SSE-NEXT: psrlq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 655b6e4c2504..fd0e1c7e2f3a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -24,11 +24,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
@@ -38,13 +38,13 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63]
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -520,7 +520,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlq %xmm2, %xmm4, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm6, %xmm4, %xmm7
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index b7cc39a32d71..817bca051e0a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -30,7 +30,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlq %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlq %xmm4, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
@@ -38,7 +38,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE2-NEXT: psubq %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllq %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE2-NEXT: psllq %xmm3, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: orpd %xmm5, %xmm0
@@ -58,7 +58,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlq %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7]
@@ -66,7 +66,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE41-NEXT: psubq %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllq %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: psllq %xmm0, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm3
@@ -80,13 +80,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
@@ -215,7 +215,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrlq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlq %xmm4, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
@@ -223,7 +223,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; X32-SSE-NEXT: psubq %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm3, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; X32-SSE-NEXT: psllq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
@@ -251,7 +251,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrld %xmm5, %xmm3
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psrld %xmm5, %xmm6
@@ -287,7 +287,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrld %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrld %xmm5, %xmm6
@@ -469,7 +469,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrld %xmm5, %xmm3
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm6
; X32-SSE-NEXT: psrld %xmm5, %xmm6
@@ -1380,7 +1380,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; X32-SSE-NEXT: psubq %xmm4, %xmm5
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm5, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; X32-SSE-NEXT: psllq %xmm5, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index e9cb0a0586f0..f0848cfd2e49 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -26,11 +26,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpsrlq %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
@@ -38,12 +38,12 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX1-NEXT: vpsubq %xmm4, %xmm8, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vpsubq %xmm2, %xmm8, %xmm6
; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index eadc8544f8af..8fe7ba9e471a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -31,14 +31,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrlq %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrlq %xmm1, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE2-NEXT: psllq %xmm2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: orpd %xmm5, %xmm0
@@ -52,14 +52,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrlq %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrlq %xmm1, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psllq %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: psllq %xmm2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm0
@@ -70,14 +70,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
@@ -138,14 +138,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrlq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psrlq %xmm1, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; X32-SSE-NEXT: psllq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
@@ -789,14 +789,14 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrlq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psrlq %xmm1, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; X32-SSE-NEXT: psllq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 61c45a118e47..4e92bfc4f913 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -24,11 +24,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
@@ -38,13 +38,13 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63]
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -566,7 +566,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsllq %xmm2, %xmm4, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm6, %xmm4, %xmm7
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index ac8e1998ceb8..22a389568735 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -20,7 +20,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: imulq %rcx
; SSE2-NEXT: movq %rdx, %rax
@@ -199,7 +199,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
@@ -223,7 +223,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
@@ -327,7 +327,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
@@ -371,7 +371,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -457,7 +457,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: subq %rax, %rdx
; SSE2-NEXT: addq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rsi
@@ -674,7 +674,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
@@ -702,7 +702,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
@@ -836,7 +836,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm4
; SSE41-NEXT: psrlw $8, %xmm4
@@ -879,7 +879,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index a9cf1aa80af8..e06140f988d5 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -163,7 +163,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
@@ -182,7 +182,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7
; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
@@ -260,7 +260,7 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm4
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
@@ -289,7 +289,7 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -574,7 +574,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
@@ -598,7 +598,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
@@ -688,7 +688,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm4
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm5, %xmm5
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
@@ -725,7 +725,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm5
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm5, %xmm5
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm6, %xmm6
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 16be83c84fb3..65131c0e3cb7 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -20,7 +20,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: addq %rdx, %rcx
; SSE2-NEXT: movq %rcx, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
@@ -445,7 +445,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: subq %rdx, %rax
; SSE2-NEXT: addq %rcx, %rax
; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index 2900ce2c8661..724f6007623d 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -452,7 +452,7 @@ define <8 x i32> @cmpne_knownzeros_zext_v8i16_v8i32(<8 x i16> %x) {
; SSE42: # %bb.0:
; SSE42-NEXT: psrlw $15, %xmm0
; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll
index e4f785dca2b1..a00e74fa1cac 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -14,21 +14,21 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
@@ -40,7 +40,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddq %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
@@ -49,7 +49,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -84,7 +84,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -96,7 +96,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -107,7 +107,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -119,7 +119,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -138,7 +138,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: paddq %xmm4, %xmm2
; SSE-NEXT: paddq %xmm3, %xmm2
; SSE-NEXT: paddq %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -156,7 +156,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -169,7 +169,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -182,7 +182,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -236,7 +236,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: paddd %xmm1, %xmm0
@@ -245,7 +245,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX1-SLOW-LABEL: test_v4i32:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -261,7 +261,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX2-LABEL: test_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -270,7 +270,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -284,7 +284,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: paddd %xmm1, %xmm0
@@ -295,7 +295,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -317,7 +317,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -329,7 +329,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -346,7 +346,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
@@ -360,7 +360,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -375,7 +375,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
@@ -387,7 +387,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -401,7 +401,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -422,7 +422,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-NEXT: paddd %xmm4, %xmm2
; SSE-NEXT: paddd %xmm3, %xmm2
; SSE-NEXT: paddd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddd %xmm0, %xmm1
@@ -442,7 +442,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -463,7 +463,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
@@ -477,7 +477,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -492,7 +492,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -608,7 +608,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: paddw %xmm1, %xmm0
@@ -621,7 +621,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX1-SLOW-LABEL: test_v8i16:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -642,7 +642,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX2-LABEL: test_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -654,7 +654,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX512-LABEL: test_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -671,7 +671,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: paddw %xmm1, %xmm0
@@ -686,7 +686,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -713,7 +713,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -728,7 +728,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -748,7 +748,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
@@ -766,7 +766,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -784,7 +784,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -799,7 +799,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -816,7 +816,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -840,7 +840,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-NEXT: paddw %xmm4, %xmm2
; SSE-NEXT: paddw %xmm3, %xmm2
; SSE-NEXT: paddw %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: paddw %xmm0, %xmm1
@@ -864,7 +864,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -888,7 +888,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -905,7 +905,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -923,7 +923,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -1142,7 +1142,7 @@ define i8 @test_v8i8_load(<8 x i8>* %p) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psadbw %xmm1, %xmm0
@@ -1152,7 +1152,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX-LABEL: test_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1162,7 +1162,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX512-LABEL: test_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1177,7 +1177,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psadbw %xmm1, %xmm0
@@ -1189,7 +1189,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1202,7 +1202,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1215,7 +1215,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1233,7 +1233,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE-NEXT: paddb %xmm3, %xmm1
; SSE-NEXT: paddb %xmm2, %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psadbw %xmm0, %xmm1
@@ -1248,7 +1248,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1262,7 +1262,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1277,7 +1277,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1299,7 +1299,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE-NEXT: paddb %xmm4, %xmm2
; SSE-NEXT: paddb %xmm3, %xmm2
; SSE-NEXT: paddb %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psadbw %xmm0, %xmm1
@@ -1320,7 +1320,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1336,7 +1336,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
@@ -1352,7 +1352,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
index c94f96958f5b..17a3d6f46e98 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
@@ -13,7 +13,7 @@
define i1 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
@@ -22,7 +22,7 @@ define i1 @test_v2i64(<2 x i64> %a0) {
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: testq %rax, %rax
@@ -37,7 +37,7 @@ define i1 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
@@ -48,7 +48,7 @@ define i1 @test_v4i64(<4 x i64> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
@@ -60,7 +60,7 @@ define i1 @test_v4i64(<4 x i64> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
@@ -72,7 +72,7 @@ define i1 @test_v4i64(<4 x i64> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: testq %rax, %rax
@@ -90,7 +90,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
@@ -102,7 +102,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
@@ -115,7 +115,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
@@ -129,7 +129,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: testq %rax, %rax
@@ -151,7 +151,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
@@ -165,7 +165,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
@@ -180,7 +180,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
@@ -195,7 +195,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: testq %rax, %rax
@@ -237,7 +237,7 @@ define i1 @test_v2i32(<2 x i32> %a0) {
define i1 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -248,7 +248,7 @@ define i1 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -265,7 +265,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -278,7 +278,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -292,7 +292,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -306,7 +306,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -326,7 +326,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -340,7 +340,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -355,7 +355,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -371,7 +371,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -395,7 +395,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -411,7 +411,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -428,7 +428,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -445,7 +445,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -518,7 +518,7 @@ define i1 @test_v4i16(<4 x i16> %a0) {
define i1 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -532,7 +532,7 @@ define i1 @test_v8i16(<8 x i16> %a0) {
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -551,7 +551,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -567,7 +567,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -583,7 +583,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -599,7 +599,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -621,7 +621,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -638,7 +638,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -655,7 +655,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -673,7 +673,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -699,7 +699,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -718,7 +718,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -737,7 +737,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -756,7 +756,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -865,7 +865,7 @@ define i1 @test_v8i8(<8 x i8> %a0) {
define i1 @test_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -882,7 +882,7 @@ define i1 @test_v16i8(<16 x i8> %a0) {
;
; AVX-LABEL: test_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -903,7 +903,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -922,7 +922,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -940,7 +940,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -958,7 +958,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -982,7 +982,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -1002,7 +1002,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -1021,7 +1021,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1041,7 +1041,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1069,7 +1069,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -1091,7 +1091,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -1112,7 +1112,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1133,7 +1133,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll
index 2fc924b1b125..4a00c22a2670 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll
@@ -13,14 +13,14 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
@@ -32,7 +32,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
@@ -41,7 +41,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -51,7 +51,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -61,7 +61,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -76,7 +76,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -86,7 +86,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -97,7 +97,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -109,7 +109,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -128,7 +128,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -140,7 +140,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -153,7 +153,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -166,7 +166,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -200,7 +200,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -209,7 +209,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -223,7 +223,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -234,7 +234,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -246,7 +246,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -258,7 +258,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -275,7 +275,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -287,7 +287,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -300,7 +300,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -314,7 +314,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -335,7 +335,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -349,7 +349,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -364,7 +364,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -379,7 +379,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -443,7 +443,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -456,7 +456,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -473,7 +473,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -518,7 +518,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -538,7 +538,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -554,7 +554,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -570,7 +570,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -587,7 +587,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -611,7 +611,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -629,7 +629,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -647,7 +647,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -665,7 +665,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -763,7 +763,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -779,7 +779,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX-LABEL: test_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -816,7 +816,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -833,7 +833,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -850,7 +850,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -891,7 +891,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -909,7 +909,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -928,7 +928,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -954,7 +954,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -975,7 +975,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -995,7 +995,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1015,7 +1015,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index bafe112c5dfc..f7e1a72f9a91 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -15,7 +15,7 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -31,7 +31,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -45,7 +45,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -59,7 +59,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX512BWVL-LABEL: test_v2i64:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -74,7 +74,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512DQ-LABEL: test_v2i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
@@ -82,7 +82,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX512DQVL-LABEL: test_v2i64:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: retq
@@ -103,7 +103,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -128,7 +128,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -152,7 +152,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -176,7 +176,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -200,7 +200,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -218,7 +218,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
@@ -228,7 +228,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: vzeroupper
@@ -270,7 +270,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -312,7 +312,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -344,7 +344,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -377,7 +377,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -410,7 +410,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -429,7 +429,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
@@ -441,7 +441,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: vzeroupper
@@ -523,7 +523,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -599,7 +599,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -647,7 +647,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -688,7 +688,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -729,7 +729,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -749,7 +749,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
@@ -762,7 +762,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: vzeroupper
@@ -810,7 +810,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
@@ -821,7 +821,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; SSE41-LABEL: test_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pmulld %xmm1, %xmm0
@@ -830,7 +830,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -839,7 +839,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -856,7 +856,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,0,2,2]
; SSE2-NEXT: pmuludq %xmm3, %xmm0
@@ -867,7 +867,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE41-LABEL: test_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pmulld %xmm1, %xmm0
@@ -878,7 +878,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -890,7 +890,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -902,7 +902,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -926,7 +926,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,0,2,2]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
@@ -939,7 +939,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE41-NEXT: pmulld %xmm3, %xmm1
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmulld %xmm0, %xmm1
@@ -953,7 +953,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -966,7 +966,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -980,7 +980,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1016,7 +1016,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-NEXT: pmuludq %xmm5, %xmm1
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,0,2,2]
; SSE2-NEXT: pmuludq %xmm11, %xmm1
@@ -1033,7 +1033,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE41-NEXT: pmulld %xmm4, %xmm2
; SSE41-NEXT: pmulld %xmm3, %xmm2
; SSE41-NEXT: pmulld %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmulld %xmm0, %xmm1
@@ -1053,7 +1053,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1068,7 +1068,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1083,7 +1083,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1165,7 +1165,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pmullw %xmm1, %xmm0
@@ -1178,7 +1178,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1190,7 +1190,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX512-LABEL: test_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1207,7 +1207,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pmullw %xmm1, %xmm0
@@ -1222,7 +1222,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1237,7 +1237,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1252,7 +1252,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1272,7 +1272,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-NEXT: pmullw %xmm3, %xmm1
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pmullw %xmm0, %xmm1
@@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1306,7 +1306,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1323,7 +1323,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1340,7 +1340,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1357,7 +1357,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1374,7 +1374,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1398,7 +1398,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-NEXT: pmullw %xmm4, %xmm2
; SSE-NEXT: pmullw %xmm3, %xmm2
; SSE-NEXT: pmullw %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pmullw %xmm0, %xmm1
@@ -1422,7 +1422,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1440,7 +1440,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1458,7 +1458,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1476,7 +1476,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1496,7 +1496,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1516,7 +1516,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1634,7 +1634,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -1650,7 +1650,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1663,7 +1663,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1711,7 +1711,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm0
@@ -1735,7 +1735,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1764,7 +1764,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX512BW-LABEL: test_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -1786,7 +1786,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX512BWVL-LABEL: test_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -1808,7 +1808,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX512DQ-LABEL: test_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -1833,7 +1833,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX512DQVL-LABEL: test_v16i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -1908,7 +1908,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE41-NEXT: packuswb %xmm4, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: pmullw %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm2
@@ -1937,7 +1937,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1982,7 +1982,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
@@ -2010,7 +2010,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
@@ -2158,7 +2158,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE41-NEXT: packuswb %xmm3, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmullw %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm1
@@ -2196,7 +2196,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -2288,10 +2288,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpmovwb %ymm2, %xmm2
+; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
@@ -2495,7 +2493,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm0
@@ -2551,7 +2549,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -2657,10 +2655,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 8223e6bd2380..e1253975d5a6 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -823,7 +823,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
define i1 @trunc_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: trunc_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: testw %ax, %ax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll
index 14eb3d27d8df..95bff8e03afa 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll
@@ -13,14 +13,14 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
@@ -32,7 +32,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
@@ -41,7 +41,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -51,7 +51,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -61,7 +61,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -76,7 +76,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -86,7 +86,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -97,7 +97,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -109,7 +109,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -128,7 +128,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -140,7 +140,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -153,7 +153,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -166,7 +166,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -200,7 +200,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -209,7 +209,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -223,7 +223,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -234,7 +234,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -246,7 +246,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -258,7 +258,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -275,7 +275,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -287,7 +287,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -300,7 +300,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -314,7 +314,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -335,7 +335,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -349,7 +349,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -364,7 +364,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -379,7 +379,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -443,7 +443,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -456,7 +456,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -473,7 +473,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -518,7 +518,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -538,7 +538,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -554,7 +554,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -570,7 +570,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -587,7 +587,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -611,7 +611,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -629,7 +629,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -647,7 +647,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -665,7 +665,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -763,7 +763,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -779,7 +779,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX-LABEL: test_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -816,7 +816,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -833,7 +833,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -850,7 +850,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -891,7 +891,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -909,7 +909,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -928,7 +928,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -954,7 +954,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -975,7 +975,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -995,7 +995,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1015,7 +1015,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
index c0705ab96e03..26bbfed52196 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
@@ -14,7 +14,7 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-LABEL: test_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
@@ -54,7 +54,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE42-LABEL: test_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE42-NEXT: movq %xmm2, %rax
@@ -62,7 +62,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
@@ -71,7 +71,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -79,7 +79,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX512VL-LABEL: test_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
@@ -106,7 +106,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
@@ -139,7 +139,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -158,7 +158,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -170,7 +170,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -182,7 +182,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -194,7 +194,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -204,7 +204,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -262,7 +262,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm4
@@ -317,7 +317,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm2
; SSE41-NEXT: pxor %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm5
@@ -343,7 +343,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE42-NEXT: movapd %xmm2, %xmm0
; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE42-NEXT: movdqa %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -360,7 +360,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -374,7 +374,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -387,7 +387,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -399,7 +399,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -517,7 +517,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm8
@@ -616,7 +616,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm2
; SSE41-NEXT: pxor %xmm9, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm9
@@ -654,7 +654,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE42-NEXT: movapd %xmm6, %xmm0
; SSE42-NEXT: pcmpgtq %xmm7, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE42-NEXT: movdqa %xmm7, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1
@@ -681,7 +681,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -699,7 +699,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -713,7 +713,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -726,7 +726,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -778,7 +778,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -795,7 +795,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; SSE4-LABEL: test_v4i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
@@ -804,7 +804,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -813,7 +813,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -831,7 +831,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -849,7 +849,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE4-LABEL: test_v8i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
@@ -860,7 +860,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -884,7 +884,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -913,7 +913,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -933,7 +933,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE4-NEXT: pmaxsd %xmm3, %xmm1
; SSE4-NEXT: pmaxsd %xmm2, %xmm1
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
@@ -947,7 +947,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -960,7 +960,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -974,7 +974,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1023,7 +1023,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -1047,7 +1047,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE4-NEXT: pmaxsd %xmm4, %xmm2
; SSE4-NEXT: pmaxsd %xmm3, %xmm2
; SSE4-NEXT: pmaxsd %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE4-NEXT: pmaxsd %xmm2, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
@@ -1067,7 +1067,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1082,7 +1082,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1097,7 +1097,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1179,7 +1179,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1224,7 +1224,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE2-LABEL: test_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE2-NEXT: pmaxsw %xmm3, %xmm1
; SSE2-NEXT: pmaxsw %xmm2, %xmm1
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
@@ -1368,7 +1368,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE2-NEXT: pmaxsw %xmm4, %xmm2
; SSE2-NEXT: pmaxsw %xmm3, %xmm2
; SSE2-NEXT: pmaxsw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
@@ -1622,7 +1622,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -1697,7 +1697,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -1803,7 +1803,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -1937,7 +1937,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
index e8259e147133..6f561c27d71d 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
@@ -14,7 +14,7 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-LABEL: test_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -53,7 +53,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE42-LABEL: test_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -62,7 +62,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
@@ -71,7 +71,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -79,7 +79,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX512VL-LABEL: test_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
@@ -106,7 +106,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
@@ -138,7 +138,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -158,7 +158,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -170,7 +170,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -182,7 +182,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -194,7 +194,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -204,7 +204,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -262,7 +262,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm4
@@ -317,7 +317,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm5
@@ -343,7 +343,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE42-NEXT: movapd %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -360,7 +360,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -374,7 +374,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -387,7 +387,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -399,7 +399,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -517,7 +517,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm8
@@ -616,7 +616,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm9
@@ -654,7 +654,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE42-NEXT: movapd %xmm7, %xmm0
; SSE42-NEXT: pcmpgtq %xmm6, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm7, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1
@@ -681,7 +681,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -699,7 +699,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -713,7 +713,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -726,7 +726,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -778,7 +778,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -795,7 +795,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; SSE4-LABEL: test_v4i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pminsd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pminsd %xmm1, %xmm0
@@ -804,7 +804,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -813,7 +813,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -831,7 +831,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -849,7 +849,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE4-LABEL: test_v8i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pminsd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pminsd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pminsd %xmm1, %xmm0
@@ -860,7 +860,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -884,7 +884,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -913,7 +913,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -933,7 +933,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE4-NEXT: pminsd %xmm3, %xmm1
; SSE4-NEXT: pminsd %xmm2, %xmm1
; SSE4-NEXT: pminsd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: pminsd %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pminsd %xmm0, %xmm1
@@ -947,7 +947,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -960,7 +960,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -974,7 +974,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1023,7 +1023,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -1047,7 +1047,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE4-NEXT: pminsd %xmm4, %xmm2
; SSE4-NEXT: pminsd %xmm3, %xmm2
; SSE4-NEXT: pminsd %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE4-NEXT: pminsd %xmm2, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pminsd %xmm0, %xmm1
@@ -1067,7 +1067,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vpminsd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpminsd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1082,7 +1082,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1097,7 +1097,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1179,7 +1179,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pminsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1224,7 +1224,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE2-LABEL: test_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pminsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE2-NEXT: pminsw %xmm3, %xmm1
; SSE2-NEXT: pminsw %xmm2, %xmm1
; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pminsw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pminsw %xmm0, %xmm1
@@ -1368,7 +1368,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE2-NEXT: pminsw %xmm4, %xmm2
; SSE2-NEXT: pminsw %xmm3, %xmm2
; SSE2-NEXT: pminsw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: pminsw %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pminsw %xmm0, %xmm1
@@ -1622,7 +1622,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -1697,7 +1697,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -1803,7 +1803,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -1937,7 +1937,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index 75e813358133..a11fff0a5b5c 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -14,7 +14,7 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-LABEL: test_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
@@ -54,7 +54,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE42-LABEL: test_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: pxor %xmm3, %xmm0
; SSE42-NEXT: pxor %xmm2, %xmm3
@@ -65,7 +65,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -77,7 +77,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -85,7 +85,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX512VL-LABEL: test_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
@@ -112,7 +112,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
@@ -145,7 +145,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -168,7 +168,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE42-NEXT: pxor %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm4, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pxor %xmm3, %xmm0
; SSE42-NEXT: pxor %xmm2, %xmm3
@@ -185,7 +185,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -202,7 +202,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -216,7 +216,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -226,7 +226,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -284,7 +284,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm4
@@ -339,7 +339,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm2
; SSE41-NEXT: pxor %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm5
@@ -374,7 +374,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE42-NEXT: xorpd %xmm5, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE42-NEXT: movdqa %xmm3, %xmm0
; SSE42-NEXT: pxor %xmm5, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm5
@@ -400,7 +400,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -421,7 +421,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -436,7 +436,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -448,7 +448,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -566,7 +566,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm8
@@ -665,7 +665,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm2
; SSE41-NEXT: pxor %xmm9, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm9
@@ -725,7 +725,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE42-NEXT: xorpd %xmm9, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE42-NEXT: movdqa %xmm7, %xmm0
; SSE42-NEXT: pxor %xmm9, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm9
@@ -769,7 +769,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -798,7 +798,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2
; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -814,7 +814,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -827,7 +827,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -882,7 +882,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -905,7 +905,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; SSE4-LABEL: test_v4i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmaxud %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pmaxud %xmm1, %xmm0
@@ -914,7 +914,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -923,7 +923,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -945,7 +945,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -968,7 +968,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE4-LABEL: test_v8i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxud %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmaxud %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pmaxud %xmm1, %xmm0
@@ -979,7 +979,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -991,7 +991,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1003,7 +1003,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1042,7 +1042,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1067,7 +1067,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE4-NEXT: pmaxud %xmm3, %xmm1
; SSE4-NEXT: pmaxud %xmm2, %xmm1
; SSE4-NEXT: pmaxud %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: pmaxud %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pmaxud %xmm0, %xmm1
@@ -1081,7 +1081,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1094,7 +1094,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1108,7 +1108,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1179,7 +1179,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1208,7 +1208,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE4-NEXT: pmaxud %xmm4, %xmm2
; SSE4-NEXT: pmaxud %xmm3, %xmm2
; SSE4-NEXT: pmaxud %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE4-NEXT: pmaxud %xmm2, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pmaxud %xmm0, %xmm1
@@ -1228,7 +1228,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmaxud %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1243,7 +1243,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1258,7 +1258,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1368,7 +1368,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm1
@@ -1433,7 +1433,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1520,7 +1520,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE2-NEXT: pmaxsw %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1625,7 +1625,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE2-NEXT: pmaxsw %xmm5, %xmm1
; SSE2-NEXT: pmaxsw %xmm4, %xmm1
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
@@ -1839,7 +1839,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxub %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -1909,7 +1909,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxub %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -2004,7 +2004,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-NEXT: pmaxub %xmm3, %xmm1
; SSE2-NEXT: pmaxub %xmm2, %xmm1
; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pmaxub %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmaxub %xmm0, %xmm1
@@ -2113,7 +2113,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-NEXT: pmaxub %xmm4, %xmm2
; SSE2-NEXT: pmaxub %xmm3, %xmm2
; SSE2-NEXT: pmaxub %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: pmaxub %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmaxub %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index 8f6a7266d97b..9da8d61223ef 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -14,7 +14,7 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-LABEL: test_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -53,7 +53,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE42-LABEL: test_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm1, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm3
@@ -65,7 +65,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -77,7 +77,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -85,7 +85,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX512VL-LABEL: test_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
@@ -112,7 +112,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
@@ -144,7 +144,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -168,7 +168,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE42-NEXT: pxor %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm4, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pxor %xmm3, %xmm0
; SSE42-NEXT: pxor %xmm2, %xmm3
@@ -186,7 +186,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
@@ -203,7 +203,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -217,7 +217,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -227,7 +227,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -285,7 +285,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm4
@@ -340,7 +340,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm5
@@ -376,7 +376,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE42-NEXT: xorpd %xmm4, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE42-NEXT: movdqa %xmm3, %xmm0
; SSE42-NEXT: pxor %xmm4, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm4
@@ -403,7 +403,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -424,7 +424,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -439,7 +439,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -451,7 +451,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -569,7 +569,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm8
@@ -668,7 +668,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm9
@@ -728,7 +728,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE42-NEXT: xorpd %xmm8, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE42-NEXT: movdqa %xmm7, %xmm0
; SSE42-NEXT: pxor %xmm8, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm8
@@ -773,7 +773,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -802,7 +802,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2
; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -818,7 +818,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -831,7 +831,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -886,7 +886,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -909,7 +909,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; SSE4-LABEL: test_v4i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pminud %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pminud %xmm1, %xmm0
@@ -918,7 +918,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -927,7 +927,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -949,7 +949,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -972,7 +972,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE4-LABEL: test_v8i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pminud %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pminud %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pminud %xmm1, %xmm0
@@ -983,7 +983,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -995,7 +995,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1007,7 +1007,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1046,7 +1046,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm6, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1071,7 +1071,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE4-NEXT: pminud %xmm3, %xmm1
; SSE4-NEXT: pminud %xmm2, %xmm1
; SSE4-NEXT: pminud %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: pminud %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pminud %xmm0, %xmm1
@@ -1085,7 +1085,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1098,7 +1098,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1112,7 +1112,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1183,7 +1183,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1212,7 +1212,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE4-NEXT: pminud %xmm4, %xmm2
; SSE4-NEXT: pminud %xmm3, %xmm2
; SSE4-NEXT: pminud %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE4-NEXT: pminud %xmm2, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pminud %xmm0, %xmm1
@@ -1232,7 +1232,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1247,7 +1247,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1262,7 +1262,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1372,7 +1372,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm1
@@ -1418,7 +1418,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pminsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1482,7 +1482,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE2-NEXT: pminsw %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pminsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1562,7 +1562,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE2-NEXT: pminsw %xmm5, %xmm1
; SSE2-NEXT: pminsw %xmm4, %xmm1
; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pminsw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pminsw %xmm0, %xmm1
@@ -1750,7 +1750,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pminub %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pminub %xmm1, %xmm0
@@ -1799,7 +1799,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pminub %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pminub %xmm1, %xmm0
@@ -1869,7 +1869,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-NEXT: pminub %xmm3, %xmm1
; SSE2-NEXT: pminub %xmm2, %xmm1
; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pminub %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pminub %xmm0, %xmm1
@@ -1951,7 +1951,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-NEXT: pminub %xmm4, %xmm2
; SSE2-NEXT: pminub %xmm3, %xmm2
; SSE2-NEXT: pminub %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: pminub %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pminub %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
index 35e6db38a584..2d69190d9d18 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
@@ -13,14 +13,14 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
@@ -32,7 +32,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
@@ -41,7 +41,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -51,7 +51,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -61,7 +61,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -76,7 +76,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -86,7 +86,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -97,7 +97,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -109,7 +109,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -128,7 +128,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: pxor %xmm4, %xmm2
; SSE-NEXT: pxor %xmm3, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -140,7 +140,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -153,7 +153,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -166,7 +166,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -200,7 +200,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
@@ -209,7 +209,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -223,7 +223,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
@@ -234,7 +234,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -246,7 +246,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -258,7 +258,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -275,7 +275,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
@@ -287,7 +287,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -300,7 +300,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -314,7 +314,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -335,7 +335,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-NEXT: pxor %xmm4, %xmm2
; SSE-NEXT: pxor %xmm3, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
@@ -349,7 +349,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -364,7 +364,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -379,7 +379,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -443,7 +443,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
@@ -456,7 +456,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -473,7 +473,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
@@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -518,7 +518,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -538,7 +538,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
@@ -554,7 +554,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -570,7 +570,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -587,7 +587,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -611,7 +611,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-NEXT: pxor %xmm4, %xmm2
; SSE-NEXT: pxor %xmm3, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
@@ -629,7 +629,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -647,7 +647,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -665,7 +665,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -763,7 +763,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
@@ -779,7 +779,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX-LABEL: test_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
@@ -816,7 +816,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -833,7 +833,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -850,7 +850,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
@@ -891,7 +891,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -909,7 +909,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -928,7 +928,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -954,7 +954,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE-NEXT: pxor %xmm4, %xmm2
; SSE-NEXT: pxor %xmm3, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
@@ -975,7 +975,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -995,7 +995,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -1015,7 +1015,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rem.ll b/llvm/test/CodeGen/X86/vector-rem.ll
index 15e119869665..deaab1c9161b 100644
--- a/llvm/test/CodeGen/X86/vector-rem.ll
+++ b/llvm/test/CodeGen/X86/vector-rem.ll
@@ -11,9 +11,9 @@ define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u) nounwind {
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; CHECK-NEXT: movd %xmm3, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; CHECK-NEXT: movd %xmm3, %ecx
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
@@ -49,9 +49,9 @@ define <4 x i32> @bar(<4 x i32> %t, <4 x i32> %u) nounwind {
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; CHECK-NEXT: movd %xmm3, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; CHECK-NEXT: movd %xmm3, %ecx
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ecx
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 67dd15ee87ab..d140fb5c0929 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -24,13 +24,13 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: psubq %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllq %xmm1, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: psrlq %xmm2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: orpd %xmm4, %xmm0
@@ -42,13 +42,13 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: psubq %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psllq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psllq %xmm1, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT: psrlq %xmm2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm4, %xmm0
@@ -59,11 +59,11 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
@@ -117,13 +117,13 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; X32-SSE-NEXT: psubq %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psllq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm1, %xmm4
; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X32-SSE-NEXT: psrlq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm4, %xmm0
@@ -719,7 +719,7 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; X32-SSE-NEXT: psllq %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; X32-SSE-NEXT: psrlq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 31fe57502475..a850ab5ba782 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -21,20 +21,20 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index d162f5c4a97a..2a4efccc07c7 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -72,7 +72,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
; SSE41-LABEL: sext_16i8_to_16i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -80,7 +80,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
; AVX1-LABEL: sext_16i8_to_16i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -107,7 +107,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
; X32-SSE41-LABEL: sext_16i8_to_16i16:
; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
@@ -149,9 +149,9 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm5
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -160,12 +160,12 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
; AVX1-LABEL: sext_32i8_to_32i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -210,9 +210,9 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5
; X32-SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm3
; X32-SSE41-NEXT: movdqa %xmm5, %xmm0
; X32-SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -367,7 +367,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; SSE41-NEXT: pmovsxbd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxbd %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbd %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovsxbd %xmm0, %xmm3
@@ -380,9 +380,9 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -391,7 +391,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; AVX2-LABEL: sext_16i8_to_16i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1
; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
@@ -421,7 +421,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X32-SSE41-NEXT: pmovsxbd %xmm1, %xmm1
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X32-SSE41-NEXT: pmovsxbd %xmm2, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm3
@@ -736,7 +736,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-LABEL: sext_8i16_to_8i32:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -744,7 +744,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX1-LABEL: sext_8i16_to_8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -771,7 +771,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; X32-SSE41-LABEL: sext_8i16_to_8i32:
; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
@@ -813,9 +813,9 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwd %xmm0, %xmm5
; SSE41-NEXT: pmovsxwd %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -824,12 +824,12 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; AVX1-LABEL: sext_16i16_to_16i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -866,9 +866,9 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm3
; X32-SSE41-NEXT: movdqa %xmm5, %xmm0
; X32-SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -1043,7 +1043,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: pmovsxwq %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwq %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm0, %xmm3
@@ -1056,9 +1056,9 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -1067,7 +1067,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX2-LABEL: sext_8i16_to_8i64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm1
; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
@@ -1101,7 +1101,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X32-SSE41-NEXT: pmovsxwq %xmm1, %xmm1
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X32-SSE41-NEXT: pmovsxwq %xmm2, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm3
@@ -1160,7 +1160,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1171,7 +1171,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1180,7 +1180,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; SSE41-LABEL: sext_4i32_to_4i64:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -1188,7 +1188,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; AVX1-LABEL: sext_4i32_to_4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -1208,7 +1208,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; X32-SSE2-NEXT: pxor %xmm2, %xmm2
; X32-SSE2-NEXT: pxor %xmm3, %xmm3
; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1217,7 +1217,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; X32-SSE41-LABEL: sext_4i32_to_4i64:
; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
@@ -1235,12 +1235,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -1254,12 +1254,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -1269,9 +1269,9 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxdq %xmm0, %xmm5
; SSE41-NEXT: pmovsxdq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -1280,12 +1280,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; AVX1-LABEL: sext_8i32_to_8i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -1312,12 +1312,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; X32-SSE2-NEXT: pxor %xmm5, %xmm5
; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-SSE2-NEXT: pxor %xmm3, %xmm3
; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -1327,9 +1327,9 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5
; X32-SSE41-NEXT: pmovsxdq %xmm1, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm3
; X32-SSE41-NEXT: movdqa %xmm5, %xmm0
; X32-SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -1849,7 +1849,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; AVX1-NEXT: negl %eax
; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -3046,7 +3046,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3058,7 +3058,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3094,7 +3094,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
; X32-SSE2-NEXT: pxor %xmm2, %xmm2
; X32-SSE2-NEXT: pxor %xmm3, %xmm3
; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3166,7 +3166,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3179,7 +3179,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3190,7 +3190,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -3200,7 +3200,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -3226,7 +3226,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; X32-SSE2-NEXT: pxor %xmm2, %xmm2
; X32-SSE2-NEXT: pxor %xmm3, %xmm3
; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3237,7 +3237,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; X32-SSE41-NEXT: pslld $31, %xmm0
; X32-SSE41-NEXT: psrad $31, %xmm0
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
@@ -3737,7 +3737,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSE41-NEXT: psrad $26, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; SSE41-NEXT: psllq $58, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
@@ -3767,9 +3767,9 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
@@ -3782,7 +3782,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
; AVX2-NEXT: vpsraw $10, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1
; AVX2-NEXT: retq
;
@@ -3861,7 +3861,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; X32-SSE41-NEXT: psrad $26, %xmm1
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X32-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; X32-SSE41-NEXT: psllq $58, %xmm2
; X32-SSE41-NEXT: movdqa %xmm2, %xmm4
@@ -3916,7 +3916,7 @@ define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -3962,7 +3962,7 @@ define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
; X32-SSE41-NEXT: pxor %xmm1, %xmm1
; X32-SSE41-NEXT: psubw %xmm0, %xmm1
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; X32-SSE41-NEXT: retl
%z = zext <8 x i8> %x to <8 x i16>
@@ -4002,7 +4002,7 @@ define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -4049,7 +4049,7 @@ define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
; X32-SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE41-NEXT: paddw %xmm0, %xmm1
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; X32-SSE41-NEXT: retl
%z = zext <8 x i8> %x to <8 x i16>
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 5764d19f4c7f..9b1fb29cb029 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -23,7 +23,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE2-NEXT: psrlq %xmm4, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -39,7 +39,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE41-NEXT: psrlq %xmm4, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -54,7 +54,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
@@ -99,7 +99,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; X32-SSE-NEXT: psrlq %xmm4, %xmm2
; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
@@ -123,7 +123,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad %xmm3, %xmm4
@@ -139,7 +139,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrad %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrad %xmm4, %xmm5
@@ -205,7 +205,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrad %xmm4, %xmm2
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrad %xmm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index 358f9b8cc4de..ae9c375eec25 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -22,7 +22,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
@@ -32,7 +32,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
@@ -92,7 +92,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; X32-AVX1-NEXT: # xmm3 = mem[0,0]
; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
@@ -102,7 +102,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index 11d118bf31c3..a994d6610d73 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -27,7 +27,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad %xmm3, %xmm4
@@ -43,7 +43,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrad %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrad %xmm4, %xmm5
@@ -109,7 +109,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrad %xmm4, %xmm2
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrad %xmm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 1a2889ab861e..2e19f753722d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -22,7 +22,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
@@ -31,7 +31,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
@@ -39,7 +39,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
@@ -75,7 +75,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
@@ -93,7 +93,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -109,7 +109,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
@@ -175,7 +175,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm4, %xmm2
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 154c35b51db9..9119e32bda37 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -22,11 +22,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -69,11 +69,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index b540421fb3de..8a843ef652e7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -27,7 +27,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -43,7 +43,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
@@ -109,7 +109,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld %xmm4, %xmm2
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index eee456150476..5f1325aacb4d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -22,7 +22,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: psllq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
@@ -31,7 +31,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: psllq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
@@ -39,7 +39,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
@@ -73,7 +73,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-SSE-NEXT: psllq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 56ebce709a8d..1296fcf8bb90 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -22,11 +22,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -66,11 +66,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 04064f1c1970..a11fc5b7c8ed 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1013,19 +1013,19 @@ define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_0c1d2e3f:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-SLOW-NEXT: retq
;
@@ -1055,19 +1055,19 @@ define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_48596a7b:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_48596a7b:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-SLOW-NEXT: retq
;
@@ -1513,7 +1513,7 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
@@ -1529,25 +1529,25 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
;
; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,0,1,2,3]
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,4,5,6,7]
; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
; AVX512VL-FAST-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 635d94b9e723..131c621162a5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3167,7 +3167,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z
; AVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -3191,7 +3191,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z
; XOPAVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -5064,7 +5064,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,u,u,u,u,u,u,u,u]
@@ -5090,7 +5090,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13],xmm3[14,15],xmm1[u,u,u,u,u,u,u,u]
; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -5161,7 +5161,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,14,15]
@@ -5187,7 +5187,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,2,3,4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -5752,9 +5752,9 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -5776,9 +5776,9 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 04759bf104cb..0508c6bac2bb 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4053,7 +4053,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
; AVX1-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -4083,7 +4083,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
; XOPAVX1-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; XOPAVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; XOPAVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 973045696fbd..4798b4b1d38a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -3216,7 +3216,7 @@ entry:
define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) {
; AVX1-LABEL: broadcast_concat_crash:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index b2d4dc76a10e..4c8073614d6d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -434,7 +434,7 @@ define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_z
; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; KNL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 3199cc0fa9b9..4237c4107d47 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -397,7 +397,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
@@ -410,7 +410,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 72c1aefb5da7..622eb0881052 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3228,7 +3228,7 @@ define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
index 25c44964a551..6adb6b0c2c0b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -372,14 +372,14 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; BTVER1: # %bb.0:
; BTVER1-NEXT: psrld $16, %xmm1
-; BTVER1-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; BTVER1-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; BTVER1-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsrld $16, %xmm1, %xmm1
-; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; BTVER2-NEXT: retq
%1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 42bf2d9eac28..5d065213f925 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -57,7 +57,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; SSE41-LABEL: zext_16i8_to_16i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -113,9 +113,9 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -277,7 +277,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -290,9 +290,9 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -301,7 +301,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; AVX2-LABEL: zext_16i8_to_16i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
@@ -515,7 +515,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-LABEL: zext_8i16_to_8i32:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -571,9 +571,9 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -727,7 +727,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -740,9 +740,9 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -751,7 +751,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX2-LABEL: zext_8i16_to_8i64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
@@ -813,7 +813,7 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; SSE41-LABEL: zext_4i32_to_4i64:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -869,9 +869,9 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -1885,7 +1885,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -1894,7 +1894,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -2213,14 +2213,14 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -2245,14 +2245,14 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vmovaps %ymm4, %ymm0
@@ -2263,9 +2263,9 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vmovdqa %ymm4, %ymm0
; AVX2-NEXT: retq
@@ -2499,7 +2499,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT: pand %xmm4, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
@@ -2518,9 +2518,9 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
@@ -2532,7 +2532,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vsel-cmp-load.ll b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
index c809090c8955..89d1b549182a 100644
--- a/llvm/test/CodeGen/X86/vsel-cmp-load.ll
+++ b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
@@ -80,7 +80,7 @@ define <16 x i16> @sgt_zero(<16 x i8>* %p, <16 x i16> %x, <16 x i16> %y) {
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index 3cd5654771c5..24849c2b850c 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -42,7 +42,7 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
; AVX1: ## %bb.0: ## %bb
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq (%rdi,%rsi,8), %rax
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index 4c56c654defa..42fbdb186357 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -364,7 +364,7 @@ define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double>
; AVX1-LABEL: signbit_sel_v4f64_small_mask:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
@@ -397,7 +397,7 @@ define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double>
; XOP-LABEL: signbit_sel_v4f64_small_mask:
; XOP: # %bb.0:
; XOP-NEXT: vpmovsxdq %xmm2, %xmm3
-; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; XOP-NEXT: vpmovsxdq %xmm2, %xmm2
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; XOP-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/vshift-4.ll b/llvm/test/CodeGen/X86/vshift-4.ll
index 9362c41a3dc8..1131d07b15d3 100644
--- a/llvm/test/CodeGen/X86/vshift-4.ll
+++ b/llvm/test/CodeGen/X86/vshift-4.ll
@@ -32,7 +32,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movdqa %xmm0, %xmm2
; X32-NEXT: psllq %xmm1, %xmm2
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X32-NEXT: psllq %xmm1, %xmm0
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-NEXT: movapd %xmm0, (%eax)
@@ -42,7 +42,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: psllq %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: psllq %xmm1, %xmm0
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-NEXT: movapd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll
index 9c567fc38415..d5022b24eabe 100644
--- a/llvm/test/CodeGen/X86/widen_conv-4.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-4.ll
@@ -28,7 +28,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin
; X86-SSE42-LABEL: convert_v7i16_v7f32:
; X86-SSE42: # %bb.0: # %entry
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1
; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -55,7 +55,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin
;
; X64-SSE42-LABEL: convert_v7i16_v7f32:
; X64-SSE42: # %bb.0: # %entry
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X64-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1
; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
diff --git a/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
index 6f335c00b589..e258cdf0035a 100644
--- a/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
+++ b/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -30,7 +30,7 @@ define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwin
; CHECK: ## %bb.0:
; CHECK-NEXT: cmpeqps %xmm1, %xmm0
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
; CHECK-NEXT: movaps %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll
index e836b00bb620..e1d6929503f1 100644
--- a/llvm/test/CodeGen/X86/xor.ll
+++ b/llvm/test/CodeGen/X86/xor.ll
@@ -407,7 +407,7 @@ define i32 @PR17487(i1 %tobool) {
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X32-NEXT: pandn {{\.LCPI.*}}, %xmm0
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-NEXT: movd %xmm0, %ecx
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: cmpl $1, %ecx
More information about the llvm-commits mailing list