[llvm] 9ba577e - [X86][SSE] canonicalizeShuffleWithBinOps - handle target shuffles. NFCI.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 15 05:07:17 PDT 2021
Author: Simon Pilgrim
Date: 2021-03-15T11:59:25Z
New Revision: 9ba577eca2e339726bfaad4e615c6324a705b292
URL: https://github.com/llvm/llvm-project/commit/9ba577eca2e339726bfaad4e615c6324a705b292
DIFF: https://github.com/llvm/llvm-project/commit/9ba577eca2e339726bfaad4e615c6324a705b292.diff
LOG: [X86][SSE] canonicalizeShuffleWithBinOps - handle target shuffles. NFCI.
Fold SHUFFLE(BINOP(SHUFFLE(X),SHUFFLE(Y))) -> BINOP(SHUFFLE'(X),SHUFFLE'(Y)) style patterns as well as the existing shuffles of constants.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/haddsub-3.ll
llvm/test/CodeGen/X86/haddsub-shuf.ll
llvm/test/CodeGen/X86/haddsub-undef.ll
llvm/test/CodeGen/X86/horizontal-sum.ll
llvm/test/CodeGen/X86/known-signbits-vector.ll
llvm/test/CodeGen/X86/phaddsub.ll
llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
llvm/test/CodeGen/X86/vec_uaddo.ll
llvm/test/CodeGen/X86/vec_usubo.ll
llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a2eb28fc7f48..5a21982dea40 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36814,7 +36814,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
return SDValue();
}
-// Canonicalize SHUFFLE(BINOP(X,C)) -> BINOP(SHUFFLE(X),SHUFFLE(C)).
+// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
const SDLoc &DL) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -36822,11 +36822,14 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
auto IsMergeableWithShuffle = [](SDValue Op) {
// AllZeros/AllOnes constants are freely shuffled and will peek through
- // bitcasts. Other constant build vectors do not peek through bitcasts.
+ // bitcasts. Other constant build vectors do not peek through bitcasts. Only
+ // merge with target shuffles if it has one use so shuffle combining is
+ // likely to kick in.
return ISD::isBuildVectorAllOnes(Op.getNode()) ||
ISD::isBuildVectorAllZeros(Op.getNode()) ||
ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
- ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode());
+ ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
+ (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
};
auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
// Ensure we only shuffle whole vector src elements, unless its a logical
diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll
index 48d4fe556555..c83a7b73edf5 100644
--- a/llvm/test/CodeGen/X86/haddsub-3.ll
+++ b/llvm/test/CodeGen/X86/haddsub-3.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefix=AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
define float @pr26491(<4 x float> %a0) {
@@ -72,11 +72,11 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) {
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSE2-NEXT: subpd {{.*}}(%rip), %xmm2
; SSE2-NEXT: movapd %xmm2, %xmm3
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: addpd %xmm2, %xmm3
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0]
-; SSE2-NEXT: divpd %xmm3, %xmm1
-; SSE2-NEXT: divpd %xmm3, %xmm0
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE2-NEXT: addpd %xmm3, %xmm2
+; SSE2-NEXT: divpd %xmm2, %xmm1
+; SSE2-NEXT: divpd %xmm2, %xmm0
; SSE2-NEXT: xorpd %xmm2, %xmm2
; SSE2-NEXT: addpd %xmm2, %xmm0
; SSE2-NEXT: addpd %xmm2, %xmm1
@@ -87,10 +87,9 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) {
; SSSE3-SLOW-NEXT: movq %rdi, %xmm2
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSSE3-SLOW-NEXT: subpd {{.*}}(%rip), %xmm2
-; SSSE3-SLOW-NEXT: movapd %xmm2, %xmm3
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm3
-; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm3[0,0]
+; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm3 = xmm2[0,0]
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSSE3-SLOW-NEXT: addpd %xmm3, %xmm2
; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm0
; SSSE3-SLOW-NEXT: xorpd %xmm2, %xmm2
@@ -111,31 +110,17 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) {
; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1
; SSSE3-FAST-NEXT: retq
;
-; AVX1-SLOW-LABEL: PR41414:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vmovq %rdi, %xmm1
-; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX1-SLOW-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm2, %xmm1
-; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-SLOW-NEXT: vdivpd %ymm1, %ymm0, %ymm0
-; AVX1-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: PR41414:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vmovq %rdi, %xmm1
-; AVX1-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX1-FAST-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-FAST-NEXT: vdivpd %ymm1, %ymm0, %ymm0
-; AVX1-FAST-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-FAST-NEXT: retq
+; AVX1-LABEL: PR41414:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq %rdi, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
;
; AVX2-LABEL: PR41414:
; AVX2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index 282ef37f6e52..22007df8320a 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -364,29 +364,10 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) {
; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
-; AVX1_SLOW-LABEL: hadd_v4f64:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
-; AVX1_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hadd_v4f64:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hadd_v4f64:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
-; AVX2_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX2_SLOW-NEXT: retq
-;
-; AVX2_FAST-LABEL: hadd_v4f64:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hadd_v4f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
%a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
%hop = fadd <4 x double> %a0, %a1
@@ -457,29 +438,10 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) {
; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
-; AVX1_SLOW-LABEL: hsub_v4f64:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
-; AVX1_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hsub_v4f64:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hsub_v4f64:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
-; AVX2_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0
-; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX2_SLOW-NEXT: retq
-;
-; AVX2_FAST-LABEL: hsub_v4f64:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hsub_v4f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
%a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
%hop = fsub <4 x double> %a0, %a1
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index 68d058433179..8a5e1cd66364 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -470,9 +470,8 @@ define <2 x double> @add_pd_010(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_010:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
-; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_pd_010:
@@ -601,10 +600,10 @@ define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
define <4 x float> @add_ps_017(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_017:
; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_ps_017:
@@ -926,10 +925,10 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) {
define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_1:
; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2,2,2]
-; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR45747_1:
@@ -957,9 +956,10 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_2:
; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
-; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR45747_2:
@@ -1009,14 +1009,14 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f32_0u23:
; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE-SLOW-NEXT: movaps %xmm0, %xmm2
+; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE-SLOW-NEXT: addps %xmm2, %xmm0
-; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
-; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE-SLOW-NEXT: addps %xmm1, %xmm3
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[0,3]
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f32_0u23:
@@ -1026,14 +1026,9 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm2
-; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2]
+; AVX-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f32_0u23:
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 89434cc4650d..156a423970bc 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -19,21 +19,20 @@
define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: haddps %xmm0, %xmm0
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
-; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm1
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
-; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm2
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm2[0,0]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm3
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
-; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
@@ -45,17 +44,11 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
;
; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX1-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
-; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
@@ -71,17 +64,11 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
;
; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX2-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX2-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
-; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
+; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
@@ -118,21 +105,19 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
-; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm1
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4
-; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm2
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm3
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2
-; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
@@ -144,21 +129,18 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
;
; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
@@ -171,21 +153,20 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
;
; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %xmm2
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
%5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
%6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
@@ -218,27 +199,24 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm8
-; SSSE3-SLOW-NEXT: haddps %xmm0, %xmm0
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
-; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm1
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
-; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
-; SSSE3-SLOW-NEXT: haddps %xmm8, %xmm1
-; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm8
+; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm1
+; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3
; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm5[3,1]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm8
-; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm5[3,1]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
+; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm7
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm6[0,2]
-; SSSE3-SLOW-NEXT: movaps %xmm8, %xmm1
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm6[0,2]
+; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -262,13 +240,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
;
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm8
-; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm8
-; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX1-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3]
+; AVX1-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm2
; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm3
@@ -314,13 +290,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
;
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm8
-; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm8
-; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3]
+; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1]
+; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
@@ -411,29 +385,25 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm8
-; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm0
-; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm1
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
-; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm8
+; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
-; SSSE3-SLOW-NEXT: movdqa %xmm8, %xmm1
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
+; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm2[2,0]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm8
-; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[2,0]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
+; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm7
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm6[0,2]
-; SSSE3-SLOW-NEXT: movaps %xmm8, %xmm1
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
+; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
@@ -460,13 +430,10 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
;
; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm8
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm8, %xmm8
-; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,1,3]
; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm3
@@ -518,13 +485,10 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
;
; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm8
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
-; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm8, %xmm8
-; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
@@ -627,77 +591,67 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
+; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
-; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-SLOW-NEXT: addps %xmm5, %xmm0
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,2,3]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
-; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm4, %xmm2
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
-; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
+; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
+; SSSE3-SLOW-NEXT: addps %xmm4, %xmm5
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
+; SSSE3-SLOW-NEXT: addps %xmm5, %xmm1
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
+; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSSE3-SLOW-NEXT: addps %xmm0, %xmm2
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm4, %xmm3
-; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT: addps %xmm2, %xmm3
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
+; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-FAST-NEXT: addps %xmm5, %xmm0
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,2,3]
-; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSSE3-FAST-NEXT: addps %xmm4, %xmm2
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
+; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1]
+; SSSE3-FAST-NEXT: addps %xmm4, %xmm5
+; SSSE3-FAST-NEXT: addps %xmm5, %xmm1
+; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0
+; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSSE3-FAST-NEXT: addps %xmm0, %xmm2
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-FAST-NEXT: addps %xmm4, %xmm3
-; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSSE3-FAST-NEXT: addps %xmm2, %xmm3
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm4
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-SLOW-NEXT: vaddps %xmm4, %xmm5, %xmm4
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
-; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
+; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm4, %xmm1
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
+; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
@@ -710,18 +664,15 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm4
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-FAST-NEXT: vaddps %xmm4, %xmm5, %xmm4
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
-; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
-; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
+; AVX-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm4, %xmm1
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
+; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
@@ -765,18 +716,18 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm4
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm2
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
+; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
+; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm5
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm4
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm3
+; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm3
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm3
-; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: retq
;
@@ -789,17 +740,17 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm4, %xmm1
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
-; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm2
-; SSSE3-FAST-NEXT: paddd %xmm2, %xmm4
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
+; SSSE3-FAST-NEXT: paddd %xmm2, %xmm4
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm3
; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm2
-; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3-FAST-NEXT: retq
;
@@ -811,22 +762,20 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0
-; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm4
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
+; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
+; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
@@ -838,20 +787,18 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0
-; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm4, %xmm1
-; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
+; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-FAST-NEXT: retq
;
@@ -863,22 +810,20 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0
-; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm4
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
+; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
+; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
;
@@ -890,20 +835,18 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0
-; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm4, %xmm1
-; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
+; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
+; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: retq
%5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
@@ -1081,28 +1024,28 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
-; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
-; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm4, %xmm1
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2
-; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
-; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
-; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
+; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3
+; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4
+; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
@@ -1129,23 +1072,19 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1]
+; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
+; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-SLOW-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX-SLOW-NEXT: vaddps %xmm3, %xmm2, %xmm2
-; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
@@ -1179,22 +1118,23 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm3
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
+; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
+; SSSE3-SLOW-NEXT: movdqa %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
@@ -1217,22 +1157,22 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
+; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; AVX-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index a70b25a830ce..18cd42c8c1de 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -511,39 +511,21 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
define <4 x i32> @signbits_mask_ashr_smax(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_smax:
; X86: # %bb.0:
-; X86-NEXT: vpsrad $26, %xmm0, %xmm2
-; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
-; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; X86-NEXT: vpsrad $26, %xmm1, %xmm2
-; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_smax:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@@ -569,39 +551,21 @@ declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @signbits_mask_ashr_smin(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_smin:
; X86: # %bb.0:
-; X86-NEXT: vpsrad $26, %xmm0, %xmm2
-; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
-; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; X86-NEXT: vpsrad $26, %xmm1, %xmm2
-; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_smin:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@@ -627,39 +591,21 @@ declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @signbits_mask_ashr_umax(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_umax:
; X86: # %bb.0:
-; X86-NEXT: vpsrad $26, %xmm0, %xmm2
-; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
-; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; X86-NEXT: vpsrad $26, %xmm1, %xmm2
-; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_umax:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@@ -685,39 +631,21 @@ declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @signbits_mask_ashr_umin(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_umin:
; X86: # %bb.0:
-; X86-NEXT: vpsrad $26, %xmm0, %xmm2
-; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
-; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; X86-NEXT: vpsrad $26, %xmm1, %xmm2
-; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_umin:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll
index ee27ac12739a..d999f5089a21 100644
--- a/llvm/test/CodeGen/X86/phaddsub.ll
+++ b/llvm/test/CodeGen/X86/phaddsub.ll
@@ -412,8 +412,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
-; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
@@ -425,8 +425,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
@@ -438,8 +438,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
-; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
%add = add <4 x i32> %l, %x
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index 3cd9c117c620..abb310d3a518 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -185,7 +185,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: leaq (%rax,%rax,8), %rax
; SSE2-NEXT: subq %rax, %rsi
-; SSE2-NEXT: movq %rsi, %xmm0
+; SSE2-NEXT: movq %rsi, %xmm1
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: imulq %r8
; SSE2-NEXT: movq %rdx, %rax
@@ -193,10 +193,10 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: leaq (%rax,%rax,8), %rax
; SSE2-NEXT: subq %rax, %rdi
-; SSE2-NEXT: movq %rdi, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [8589934591,8589934591]
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movq %rdi, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rdx
@@ -208,14 +208,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax
; SSE2-NEXT: addq %rcx, %rax
; SSE2-NEXT: movq %rax, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3]
+; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 83db253d122d..7cf566f7b3a1 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -853,13 +853,13 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,3,3]
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,3,3]
+; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 69d66ebcdb69..dde6832d6482 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -899,13 +899,13 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: pxor %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,3,3]
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,3,3]
+; SSE-NEXT: por %xmm4, %xmm1
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
index 6adb6b0c2c0b..d62462c4e59a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -363,8 +363,8 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; AMD10H: # %bb.0:
; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; AMD10H-NEXT: andpd {{.*}}(%rip), %xmm0
-; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AMD10H-NEXT: andps {{.*}}(%rip), %xmm0
; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AMD10H-NEXT: packuswb %xmm0, %xmm0
; AMD10H-NEXT: retq
More information about the llvm-commits mailing list