[llvm] r339335 - [X86][SSE] Combine (some) target shuffles with multiple uses
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 9 05:30:02 PDT 2018
Author: rksimon
Date: Thu Aug 9 05:30:02 2018
New Revision: 339335
URL: http://llvm.org/viewvc/llvm-project?rev=339335&view=rev
Log:
[X86][SSE] Combine (some) target shuffles with multiple uses
As discussed on D41794, we have many cases where we fail to combine shuffles because the input operands have other uses.
This patch permits these shuffles to be combined as long as they don't introduce additional variable shuffle masks, which should reduce instruction dependencies and still allow the total number of shuffles to drop without increasing the constant pool.
However, this may mean that some memory folds no longer occur, and on pre-AVX targets it may require the occasional extra register move.
This also exposes some poor PMULDQ/PMULUDQ codegen that performs unnecessary upper/lower calculations which will in fact fold to zero/undef - the fix will be added in a follow-up commit.
Differential Revision: https://reviews.llvm.org/D50328
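To make the gating rule easier to follow before reading the diff below: the idea is that recursion into a multi-use shuffle operand is still permitted, but variable-mask generation is disabled along that path so no new constant-pool shuffle masks are created. The following is a minimal standalone C++ sketch of that rule, not the LLVM code itself; names such as ShuffleNode, combineChain and combineRecursively are hypothetical stand-ins for the SelectionDAG types and the combineX86ShuffleChain/combineX86ShufflesRecursively functions touched by the patch.

#include <cstdio>
#include <vector>

struct ShuffleNode {
  unsigned NumUses;                    // how many users this value has
  std::vector<ShuffleNode *> Operands; // source operands (target shuffles)
};

// Stand-in for combineX86ShuffleChain: only emit a variable-mask shuffle
// when the caller says it is allowed.
static bool combineChain(const ShuffleNode &N, bool AllowVariableMask) {
  bool NeedsVariableMask = N.Operands.size() > 1; // pretend heuristic
  if (NeedsVariableMask && !AllowVariableMask)
    return false; // would grow the constant pool; give up on this path
  return true;
}

// Simplified combineX86ShufflesRecursively: before this change the multi-use
// case simply bailed out; now it recurses with AllowVariableMask cleared.
static bool combineRecursively(const ShuffleNode &N, unsigned Depth,
                               bool AllowVariableMask) {
  if (Depth > 8) // bound the recursion, as in the real combine
    return false;
  for (const ShuffleNode *Op : N.Operands) {
    // Multi-use operands may still be combined, but only with fixed masks.
    bool AllowVar = (Op->NumUses == 1) && AllowVariableMask;
    if (combineRecursively(*Op, Depth + 1, AllowVar))
      return true;
  }
  return combineChain(N, AllowVariableMask);
}

int main() {
  ShuffleNode Leaf{/*NumUses=*/2, {}};
  ShuffleNode Root{/*NumUses=*/1, {&Leaf}};
  std::printf("combined: %d\n",
              combineRecursively(Root, /*Depth=*/1,
                                 /*AllowVariableMask=*/true));
}

In the actual patch the same decision is threaded through as the new AllowVariableMask parameter, with root callers passing /*AllowVarMask*/ true, as the diff below shows.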
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/2012-01-12-extract-sv.ll
llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-128.ll
llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll
llvm/trunk/test/CodeGen/X86/combine-shl.ll
llvm/trunk/test/CodeGen/X86/extractelement-load.ll
llvm/trunk/test/CodeGen/X86/madd.ll
llvm/trunk/test/CodeGen/X86/mmx-arith.ll
llvm/trunk/test/CodeGen/X86/oddshuffles.ll
llvm/trunk/test/CodeGen/X86/pmul.ll
llvm/trunk/test/CodeGen/X86/pr29112.ll
llvm/trunk/test/CodeGen/X86/pr34592.ll
llvm/trunk/test/CodeGen/X86/sdiv-exact.ll
llvm/trunk/test/CodeGen/X86/shrink_vmul.ll
llvm/trunk/test/CodeGen/X86/sse2-schedule.ll
llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/vec_insert-3.ll
llvm/trunk/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
llvm/trunk/test/CodeGen/X86/vector-sext.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Aug 9 05:30:02 2018
@@ -29652,7 +29652,8 @@ static bool matchBinaryPermuteVectorShuf
/// instruction but should only be used to replace chains over a certain depth.
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
- bool HasVariableMask, SelectionDAG &DAG,
+ bool HasVariableMask,
+ bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
@@ -29865,7 +29866,7 @@ static SDValue combineX86ShuffleChain(Ar
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
- bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
+ AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
@@ -30199,7 +30200,8 @@ static SDValue combineX86ShufflesConstan
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
- bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
@@ -30354,18 +30356,23 @@ static SDValue combineX86ShufflesRecursi
CombinedNodes.push_back(Op.getNode());
// See if we can recurse into each shuffle source op (if it's a target
- // shuffle). The source op should only be combined if it either has a
- // single use (i.e. current Op) or all its users have already been combined.
+ // shuffle). The source op should only be generally combined if it either has
+ // a single use (i.e. current Op) or all its users have already been combined,
+ // if not then we can still combine but should prevent generation of variable
+ // shuffles to avoid constant pool bloat.
// Don't recurse if we already have more source ops than we can combine in
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
- for (int i = 0, e = Ops.size(); i < e; ++i)
+ for (int i = 0, e = Ops.size(); i < e; ++i) {
+ bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
- if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
- DAG, Subtarget))
- return Res;
+ AllowVar = AllowVariableMask;
+ if (SDValue Res = combineX86ShufflesRecursively(
+ Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
+ AllowVar, DAG, Subtarget))
+ return Res;
+ }
}
// Attempt to constant fold all of the constant source ops.
@@ -30395,8 +30402,8 @@ static SDValue combineX86ShufflesRecursi
}
// Finally, try to combine into a single shuffle instruction.
- return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
- Subtarget);
+ return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget);
}
/// Get the PSHUF-style mask from PSHUF node.
@@ -30697,7 +30704,7 @@ static SDValue combineTargetShuffle(SDVa
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, Subtarget))
+ /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
}
@@ -31316,7 +31323,7 @@ static SDValue combineShuffle(SDNode *N,
// a particular chain.
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, Subtarget))
+ /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return Res;
}
@@ -34223,7 +34230,8 @@ static SDValue combineVectorPack(SDNode
SDValue Op(N, 0);
if (SDValue Res =
combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, Subtarget))
+ /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, DAG, Subtarget))
return Res;
return SDValue();
@@ -34283,7 +34291,7 @@ static SDValue combineVectorShiftImm(SDN
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, Subtarget))
+ /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return Res;
}
@@ -34322,7 +34330,8 @@ static SDValue combineVectorInsert(SDNod
SDValue Op(N, 0);
if (SDValue Res =
combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, Subtarget))
+ /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, DAG, Subtarget))
return Res;
return SDValue();
@@ -34848,7 +34857,7 @@ static SDValue combineAnd(SDNode *N, Sel
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, Subtarget))
+ /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return Res;
}
@@ -34885,7 +34894,7 @@ static SDValue combineAnd(SDNode *N, Sel
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
- /*HasVarMask*/ false, DAG, Subtarget))
+ /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
}
@@ -37419,7 +37428,7 @@ static SDValue combineAndnp(SDNode *N, S
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(
{Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, DAG, Subtarget))
+ /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return Res;
}
Modified: llvm/trunk/test/CodeGen/X86/2012-01-12-extract-sv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2012-01-12-extract-sv.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/2012-01-12-extract-sv.ll (original)
+++ llvm/trunk/test/CodeGen/X86/2012-01-12-extract-sv.ll Thu Aug 9 05:30:02 2018
@@ -6,10 +6,10 @@ define void @endless_loop() {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovaps (%eax), %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
Modified: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll Thu Aug 9 05:30:02 2018
@@ -1823,13 +1823,11 @@ declare <16 x i16> @llvm.x86.avx2.mpsadb
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllq $32, %ymm0, %ymm0
-; CHECK-NEXT: vpsrad $31, %ymm0, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vpsllq $32, %ymm0, %ymm2
+; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; CHECK-NEXT: vpsllq $32, %ymm1, %ymm1
-; CHECK-NEXT: vpsrad $31, %ymm1, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vpsllq $32, %ymm1, %ymm2
+; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll Thu Aug 9 05:30:02 2018
@@ -6491,11 +6491,11 @@ define i64 @test_mm512_reduce_mul_epi64(
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
-; X86-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
-; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
-; X86-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
-; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
+; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
+; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
@@ -6525,11 +6525,11 @@ define i64 @test_mm512_reduce_mul_epi64(
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
-; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
-; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
-; X64-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
-; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
+; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
+; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
@@ -6696,11 +6696,11 @@ define i64 @test_mm512_mask_reduce_mul_e
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
-; X86-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
-; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
-; X86-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
-; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
+; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
+; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
@@ -6733,11 +6733,11 @@ define i64 @test_mm512_mask_reduce_mul_e
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
-; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm2
-; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
-; X64-NEXT: vpmuludq %xmm0, %xmm3, %xmm3
-; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
+; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
+; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
Modified: llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-128.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-128.ll Thu Aug 9 05:30:02 2018
@@ -539,22 +539,18 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32>
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpsllq $32, %xmm2, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -564,22 +560,18 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32>
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX2-NEXT: vpsllq $32, %xmm3, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm3
+; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm3
+; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
Modified: llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll Thu Aug 9 05:30:02 2018
@@ -360,13 +360,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32>
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
@@ -375,13 +373,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32>
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
Modified: llvm/trunk/test/CodeGen/X86/combine-shl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-shl.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-shl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-shl.ll Thu Aug 9 05:30:02 2018
@@ -406,21 +406,19 @@ define <4 x i32> @combine_vec_shl_ge_ash
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $5, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad $4, %xmm3
; SSE2-NEXT: psrad $3, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,64,128,256]
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ge_ashr_extact1:
@@ -472,21 +470,19 @@ define <4 x i32> @combine_vec_shl_lt_ash
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $7, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $6, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad $6, %xmm3
; SSE2-NEXT: psrad $5, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,16,32,256]
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_lt_ashr_extact1:
@@ -541,21 +537,19 @@ define <4 x i32> @combine_vec_shl_gt_lsh
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $5, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrld $4, %xmm3
; SSE2-NEXT: psrld $3, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,64,128,256]
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_gt_lshr1:
@@ -610,21 +604,19 @@ define <4 x i32> @combine_vec_shl_le_lsh
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $7, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $6, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrld $6, %xmm3
; SSE2-NEXT: psrld $5, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,16,32,256]
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_le_lshr1:
Modified: llvm/trunk/test/CodeGen/X86/extractelement-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/extractelement-load.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/extractelement-load.ll (original)
+++ llvm/trunk/test/CodeGen/X86/extractelement-load.ll Thu Aug 9 05:30:02 2018
@@ -85,10 +85,9 @@ define i64 @t4(<2 x double>* %a) {
; X32-SSE2-LABEL: t4:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X32-SSE2-NEXT: movd %xmm1, %eax
-; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-SSE2-NEXT: movdqa (%eax), %xmm0
+; X32-SSE2-NEXT: movd %xmm0, %eax
+; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X32-SSE2-NEXT: movd %xmm0, %edx
; X32-SSE2-NEXT: retl
;
Modified: llvm/trunk/test/CodeGen/X86/madd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/madd.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/madd.ll (original)
+++ llvm/trunk/test/CodeGen/X86/madd.ll Thu Aug 9 05:30:02 2018
@@ -2077,26 +2077,20 @@ define <4 x i32> @pmaddwd_negative2(<8 x
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,7,42,32]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,7,42,32]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,4294934528,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm6
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2]
; SSE2-NEXT: pmuludq %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,4294934528,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pmuludq %xmm5, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
+; SSE2-NEXT: paddd %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: pmaddwd_negative2:
Modified: llvm/trunk/test/CodeGen/X86/mmx-arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mmx-arith.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mmx-arith.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mmx-arith.ll Thu Aug 9 05:30:02 2018
@@ -213,12 +213,11 @@ define void @test1(x86_mmx* %A, x86_mmx*
; X32-NEXT: movq %xmm0, (%eax)
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT: movdqa %xmm1, %xmm2
-; X32-NEXT: psrlq $32, %xmm2
-; X32-NEXT: pmuludq %xmm0, %xmm2
-; X32-NEXT: movdqa %xmm0, %xmm3
+; X32-NEXT: pxor %xmm2, %xmm2
+; X32-NEXT: pmuludq %xmm1, %xmm2
+; X32-NEXT: movdqa %xmm1, %xmm3
; X32-NEXT: psrlq $32, %xmm3
-; X32-NEXT: pmuludq %xmm1, %xmm3
+; X32-NEXT: pmuludq %xmm0, %xmm3
; X32-NEXT: paddq %xmm2, %xmm3
; X32-NEXT: psllq $32, %xmm3
; X32-NEXT: pmuludq %xmm1, %xmm0
@@ -254,12 +253,11 @@ define void @test1(x86_mmx* %A, x86_mmx*
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: psrlq $32, %xmm2
-; X64-NEXT: pmuludq %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: movdqa %xmm1, %xmm3
; X64-NEXT: psrlq $32, %xmm3
-; X64-NEXT: pmuludq %xmm1, %xmm3
+; X64-NEXT: pmuludq %xmm0, %xmm3
; X64-NEXT: paddq %xmm2, %xmm3
; X64-NEXT: psllq $32, %xmm3
; X64-NEXT: pmuludq %xmm0, %xmm1
Modified: llvm/trunk/test/CodeGen/X86/oddshuffles.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/oddshuffles.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/oddshuffles.ll (original)
+++ llvm/trunk/test/CodeGen/X86/oddshuffles.ll Thu Aug 9 05:30:02 2018
@@ -563,24 +563,22 @@ define void @v12i16(<8 x i16> %a, <8 x i
define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; SSE2-LABEL: v12i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
-; SSE2-NEXT: movaps %xmm2, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
-; SSE2-NEXT: movaps %xmm2, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[1,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,2]
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0]
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,1]
+; SSE2-NEXT: movaps %xmm2, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,2]
; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT: movaps %xmm0, 32(%rdi)
-; SSE2-NEXT: movaps %xmm4, 16(%rdi)
-; SSE2-NEXT: movaps %xmm3, (%rdi)
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,2]
+; SSE2-NEXT: movaps %xmm2, 32(%rdi)
+; SSE2-NEXT: movaps %xmm5, 16(%rdi)
+; SSE2-NEXT: movaps %xmm4, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v12i32:
@@ -877,35 +875,33 @@ define void @interleave_24i8_in(<24 x i8
; SSE2-LABEL: interleave_24i8_in:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,2,2]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,3,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
-; SSE2-NEXT: pandn %xmm2, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,1,3,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
+; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,4,5]
-; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5]
+; SSE2-NEXT: packuswb %xmm5, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6]
; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm1
@@ -993,11 +989,11 @@ define void @interleave_24i16_out(<24 x
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm5
@@ -1277,44 +1273,48 @@ define void @interleave_24i16_in(<24 x i
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
-; SSE2-NEXT: movups 80(%rdi), %xmm9
-; SSE2-NEXT: movups 64(%rdi), %xmm10
+; SSE2-NEXT: movups 80(%rdi), %xmm8
+; SSE2-NEXT: movups 64(%rdi), %xmm11
; SSE2-NEXT: movups (%rdi), %xmm0
-; SSE2-NEXT: movups 16(%rdi), %xmm11
-; SSE2-NEXT: movups 32(%rdi), %xmm8
-; SSE2-NEXT: movups 48(%rdi), %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
-; SSE2-NEXT: movaps %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE2-NEXT: movups 16(%rdi), %xmm10
+; SSE2-NEXT: movups 32(%rdi), %xmm9
+; SSE2-NEXT: movdqu 48(%rdi), %xmm1
+; SSE2-NEXT: movaps %xmm0, %xmm6
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm10[2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE2-NEXT: movaps %xmm9, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
-; SSE2-NEXT: movaps %xmm0, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm11[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT: movaps %xmm8, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,0]
-; SSE2-NEXT: movups %xmm3, 16(%rsi)
-; SSE2-NEXT: movups %xmm5, (%rsi)
-; SSE2-NEXT: movups %xmm2, 16(%rdx)
+; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[2,0]
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm9[2,0]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm11[2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-NEXT: movaps %xmm8, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm8[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm11[0,0]
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm12[2,0]
+; SSE2-NEXT: movups %xmm6, 16(%rsi)
+; SSE2-NEXT: movups %xmm3, (%rsi)
+; SSE2-NEXT: movups %xmm4, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
-; SSE2-NEXT: movups %xmm7, 16(%rcx)
-; SSE2-NEXT: movups %xmm1, (%rcx)
+; SSE2-NEXT: movups %xmm2, 16(%rcx)
+; SSE2-NEXT: movups %xmm7, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
@@ -1507,48 +1507,44 @@ define void @interleave_24i32_out(<24 x
define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_in:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqu (%rsi), %xmm5
-; SSE2-NEXT: movdqu 16(%rsi), %xmm2
-; SSE2-NEXT: movdqu (%rdx), %xmm6
-; SSE2-NEXT: movdqu 16(%rdx), %xmm1
-; SSE2-NEXT: movups (%rcx), %xmm7
+; SSE2-NEXT: movups (%rsi), %xmm5
+; SSE2-NEXT: movups 16(%rsi), %xmm8
+; SSE2-NEXT: movups (%rdx), %xmm6
+; SSE2-NEXT: movups 16(%rdx), %xmm3
+; SSE2-NEXT: movups (%rcx), %xmm0
; SSE2-NEXT: movups 16(%rcx), %xmm4
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
-; SSE2-NEXT: movaps %xmm7, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2]
-; SSE2-NEXT: movaps %xmm7, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[1,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,2]
-; SSE2-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2],xmm5[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2]
-; SSE2-NEXT: movaps %xmm4, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm6[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm7[0,2]
+; SSE2-NEXT: movaps %xmm0, %xmm7
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm5[1,0]
+; SSE2-NEXT: movaps %xmm5, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2]
+; SSE2-NEXT: movaps %xmm5, %xmm7
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,1]
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2],xmm0[3,2]
+; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,2]
+; SSE2-NEXT: movaps %xmm4, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[1,0]
+; SSE2-NEXT: movaps %xmm8, %xmm6
+; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2]
+; SSE2-NEXT: movaps %xmm8, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,1]
; SSE2-NEXT: movaps %xmm4, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[1,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,2]
-; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[0,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
-; SSE2-NEXT: movups %xmm2, 80(%rdi)
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,2],xmm4[3,2]
+; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,2]
+; SSE2-NEXT: movups %xmm4, 80(%rdi)
; SSE2-NEXT: movups %xmm7, 64(%rdi)
; SSE2-NEXT: movups %xmm6, 48(%rdi)
-; SSE2-NEXT: movups %xmm5, 32(%rdi)
-; SSE2-NEXT: movups %xmm3, 16(%rdi)
-; SSE2-NEXT: movups %xmm0, (%rdi)
+; SSE2-NEXT: movups %xmm0, 32(%rdi)
+; SSE2-NEXT: movups %xmm2, 16(%rdi)
+; SSE2-NEXT: movups %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_in:
Modified: llvm/trunk/test/CodeGen/X86/pmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmul.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmul.ll Thu Aug 9 05:30:02 2018
@@ -1318,83 +1318,76 @@ entry:
define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: psrad $16, %xmm9
-; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm8, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: psrad $16, %xmm8
-; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm5, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm0, %xmm4
+; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pmuludq %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm1, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: paddq %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm6, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: psrlq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: psrlq $32, %xmm6
-; SSE2-NEXT: pmuludq %xmm4, %xmm6
-; SSE2-NEXT: paddq %xmm5, %xmm6
-; SSE2-NEXT: psllq $32, %xmm6
-; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: paddq %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm7, %xmm4
-; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: psrlq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm7, %xmm5
-; SSE2-NEXT: paddq %xmm4, %xmm5
-; SSE2-NEXT: psllq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm7, %xmm1
-; SSE2-NEXT: paddq %xmm5, %xmm1
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: psrlq $32, %xmm5
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE2-NEXT: pmuludq %xmm2, %xmm5
-; SSE2-NEXT: paddq %xmm4, %xmm5
-; SSE2-NEXT: psllq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm9, %xmm2
-; SSE2-NEXT: paddq %xmm5, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm6, %xmm4
+; SSE2-NEXT: paddq %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pmuludq %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: psrad $16, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm5
-; SSE2-NEXT: psrlq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm3, %xmm5
-; SSE2-NEXT: paddq %xmm4, %xmm5
-; SSE2-NEXT: psllq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm8, %xmm3
-; SSE2-NEXT: paddq %xmm5, %xmm3
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT: pmuludq %xmm3, %xmm6
+; SSE2-NEXT: pmuludq %xmm5, %xmm4
+; SSE2-NEXT: paddq %xmm6, %xmm4
+; SSE2-NEXT: pmuludq %xmm5, %xmm3
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_sext:
Modified: llvm/trunk/test/CodeGen/X86/pr29112.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr29112.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr29112.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr29112.ll Thu Aug 9 05:30:02 2018
@@ -8,63 +8,61 @@ declare <4 x float> @foo(<4 x float>, <4
define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
; CHECK-LABEL: bar:
; CHECK: # %bb.0:
-; CHECK-NEXT: subq $88, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 96
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: vmovaps %xmm1, %xmm8
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm8
-; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0],xmm1[0],xmm8[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0,1],xmm2[1],xmm9[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm15 = xmm0[0,1,2],xmm3[1]
-; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0],xmm1[1],xmm8[2,3]
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm5
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[0,1],xmm2[1],xmm10[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0],xmm1[1],xmm5[2,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm6[0,1],xmm2[1],xmm6[3]
+; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm7
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm11[0,1,2],xmm3[1]
+; CHECK-NEXT: vaddps %xmm4, %xmm6, %xmm12
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm7[2],zero,zero
+; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm4[0]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[3,1,2,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm1[2],zero,zero
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[1]
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm2[1],xmm9[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm4[0,1],xmm2[1],xmm4[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm2[1],xmm7[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
-; CHECK-NEXT: vmovshdup {{.*#+}} xmm7 = xmm8[1,1,3,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm3[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1,2],xmm3[3]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm12 = xmm3[1,0]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm1[1,0]
-; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm0
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1]
-; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm13[0],xmm8[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm9[0,1],xmm2[3],xmm9[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0]
-; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2
-; CHECK-NEXT: vmovaps %xmm15, %xmm1
-; CHECK-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm9
-; CHECK-NEXT: vaddps %xmm15, %xmm15, %xmm8
-; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0
+; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm3[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0,1,2],xmm3[1]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2
; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm0
-; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vaddps %xmm13, %xmm1, %xmm9
+; CHECK-NEXT: vaddps %xmm12, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm9, (%rsp)
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; CHECK-NEXT: vmovaps %xmm8, %xmm3
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
Modified: llvm/trunk/test/CodeGen/X86/pr34592.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr34592.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr34592.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr34592.ll Thu Aug 9 05:30:02 2018
@@ -10,7 +10,7 @@ define <16 x i64> @pluto(<16 x i64> %arg
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
-; CHECK-NEXT: subq $320, %rsp # imm = 0x140
+; CHECK-NEXT: subq $352, %rsp # imm = 0x160
; CHECK-NEXT: vmovaps 240(%rbp), %ymm8
; CHECK-NEXT: vmovaps 208(%rbp), %ymm9
; CHECK-NEXT: vmovaps 176(%rbp), %ymm10
@@ -21,21 +21,23 @@ define <16 x i64> @pluto(<16 x i64> %arg
; CHECK-NEXT: vmovaps 16(%rbp), %ymm15
; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6
-; CHECK-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1],ymm8[2,3,4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm8[2,3,4,5,6,7]
; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3,4,5,6,7]
; CHECK-NEXT: # kill: def $xmm9 killed $xmm9 killed $ymm9
; CHECK-NEXT: vmovdqa %xmm9, %xmm11
; CHECK-NEXT: # kill: def $ymm11 killed $xmm11
; CHECK-NEXT: vpalignr {{.*#+}} ymm6 = ymm2[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,0]
-; CHECK-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # implicit-def: $ymm0
; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
; CHECK-NEXT: vmovaps %xmm2, %xmm9
; CHECK-NEXT: # implicit-def: $ymm2
; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
; CHECK-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
; CHECK-NEXT: vmovaps %xmm7, %xmm9
@@ -49,15 +51,15 @@ define <16 x i64> @pluto(<16 x i64> %arg
; CHECK-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,1,3]
; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5]
; CHECK-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
-; CHECK-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm5, %ymm1
-; CHECK-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm6, %ymm3
-; CHECK-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm14, (%rsp) # 32-byte Spill
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
Modified: llvm/trunk/test/CodeGen/X86/sdiv-exact.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sdiv-exact.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sdiv-exact.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sdiv-exact.ll Thu Aug 9 05:30:02 2018
@@ -227,13 +227,14 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
; X86-NEXT: movd %eax, %xmm1
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; X86-NEXT: movd %xmm2, %eax
+; X86-NEXT: movaps %xmm0, %xmm2
+; X86-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; X86-NEXT: movd %xmm1, %eax
; X86-NEXT: sarl $3, %eax
; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
; X86-NEXT: movd %eax, %xmm1
-; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X86-NEXT: retl
;
Modified: llvm/trunk/test/CodeGen/X86/shrink_vmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shrink_vmul.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shrink_vmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shrink_vmul.ll Thu Aug 9 05:30:02 2018
@@ -1235,17 +1235,14 @@ define void @mul_2xi16_sext_zext(i8* noc
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: psrlq $32, %xmm2
+; X86-SSE-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: pmuludq %xmm0, %xmm2
-; X86-SSE-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE-NEXT: psrlq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE-NEXT: paddq %xmm2, %xmm3
-; X86-SSE-NEXT: psllq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT: paddq %xmm3, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT: paddq %xmm1, %xmm2
+; X86-SSE-NEXT: psllq $32, %xmm2
+; X86-SSE-NEXT: paddq %xmm3, %xmm2
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: .cfi_def_cfa_offset 4
@@ -1282,17 +1279,14 @@ define void @mul_2xi16_sext_zext(i8* noc
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X64-SSE-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE-NEXT: psrlq $32, %xmm2
+; X64-SSE-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm3
+; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
-; X64-SSE-NEXT: movdqa %xmm0, %xmm3
-; X64-SSE-NEXT: psrlq $32, %xmm3
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE-NEXT: paddq %xmm2, %xmm3
-; X64-SSE-NEXT: psllq $32, %xmm3
-; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT: paddq %xmm3, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT: paddq %xmm1, %xmm2
+; X64-SSE-NEXT: psllq $32, %xmm2
+; X64-SSE-NEXT: paddq %xmm3, %xmm2
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
@@ -2059,14 +2053,12 @@ define void @mul_2xi16_varconst3(i8* noc
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0]
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE-NEXT: psrlq $32, %xmm0
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: psllq $32, %xmm0
-; X86-SSE-NEXT: paddq %xmm2, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,65536,0]
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
+; X86-SSE-NEXT: psllq $32, %xmm1
+; X86-SSE-NEXT: paddq %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -2091,15 +2083,13 @@ define void @mul_2xi16_varconst3(i8* noc
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000
-; X64-SSE-NEXT: movq %rcx, %xmm1
-; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: movq %rcx, %xmm2
+; X64-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT: psrlq $32, %xmm0
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: psllq $32, %xmm0
-; X64-SSE-NEXT: paddq %xmm2, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT: psllq $32, %xmm2
+; X64-SSE-NEXT: paddq %xmm0, %xmm2
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
@@ -2145,13 +2135,12 @@ define void @mul_2xi16_varconst4(i8* noc
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0]
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE-NEXT: psrlq $32, %xmm0
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: psllq $32, %xmm0
-; X86-SSE-NEXT: paddq %xmm2, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
+; X86-SSE-NEXT: psllq $32, %xmm2
+; X86-SSE-NEXT: paddq %xmm0, %xmm2
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -2176,13 +2165,12 @@ define void @mul_2xi16_varconst4(i8* noc
; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
; X64-SSE-NEXT: movq %rcx, %xmm1
; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT: psrlq $32, %xmm0
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: psllq $32, %xmm0
-; X64-SSE-NEXT: paddq %xmm2, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
+; X64-SSE-NEXT: psllq $32, %xmm2
+; X64-SSE-NEXT: paddq %xmm0, %xmm2
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/sse2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-schedule.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2-schedule.ll Thu Aug 9 05:30:02 2018
@@ -15003,141 +15003,131 @@ define <2 x double> @test_unpckhpd(<2 x
define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_unpcklpd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; GENERIC-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00]
-; GENERIC-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
-; GENERIC-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; GENERIC-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
+; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_unpcklpd:
; ATOM: # %bb.0:
-; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; ATOM-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.50]
-; ATOM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
-; ATOM-NEXT: addpd %xmm0, %xmm1 # sched: [6:3.00]
-; ATOM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.50]
+; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [1:1.00]
+; ATOM-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm2, %xmm0 # sched: [6:3.00]
; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_unpcklpd:
; SLM: # %bb.0:
-; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SLM-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.50]
-; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
-; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.50]
+; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [4:1.00]
+; SLM-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
+; SLM-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-SSE-LABEL: test_unpcklpd:
; SANDY-SSE: # %bb.0:
-; SANDY-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SANDY-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00]
-; SANDY-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
-; SANDY-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00]
+; SANDY-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
+; SANDY-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; SANDY-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-SSE-NEXT: retq # sched: [1:1.00]
;
; SANDY-LABEL: test_unpcklpd:
; SANDY: # %bb.0:
-; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
-; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-SSE-LABEL: test_unpcklpd:
; HASWELL-SSE: # %bb.0:
-; HASWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00]
-; HASWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
-; HASWELL-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00]
+; HASWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
+; HASWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; HASWELL-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-SSE-NEXT: retq # sched: [7:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # %bb.0:
-; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
-; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00]
+; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; BROADWELL-SSE-LABEL: test_unpcklpd:
; BROADWELL-SSE: # %bb.0:
-; BROADWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; BROADWELL-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00]
-; BROADWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
-; BROADWELL-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; BROADWELL-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; BROADWELL-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00]
+; BROADWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
+; BROADWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [6:1.00]
+; BROADWELL-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
; BROADWELL-SSE-NEXT: retq # sched: [7:1.00]
;
; BROADWELL-LABEL: test_unpcklpd:
; BROADWELL: # %bb.0:
-; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00]
-; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00]
+; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BROADWELL-NEXT: retq # sched: [7:1.00]
;
; SKYLAKE-SSE-LABEL: test_unpcklpd:
; SKYLAKE-SSE: # %bb.0:
-; SKYLAKE-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKYLAKE-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.33]
-; SKYLAKE-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
-; SKYLAKE-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [4:0.50]
-; SKYLAKE-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.33]
+; SKYLAKE-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
+; SKYLAKE-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; SKYLAKE-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [4:0.50]
; SKYLAKE-SSE-NEXT: retq # sched: [7:1.00]
;
; SKYLAKE-LABEL: test_unpcklpd:
; SKYLAKE: # %bb.0:
-; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
-; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; SKX-SSE-LABEL: test_unpcklpd:
; SKX-SSE: # %bb.0:
-; SKX-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.33]
-; SKX-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
-; SKX-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [4:0.50]
-; SKX-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.33]
+; SKX-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
+; SKX-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; SKX-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [4:0.50]
; SKX-SSE-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: test_unpcklpd:
; SKX: # %bb.0:
-; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-SSE-LABEL: test_unpcklpd:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
-; BTVER2-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.50]
-; BTVER2-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
-; BTVER2-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; BTVER2-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
+; BTVER2-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.50]
+; BTVER2-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [6:1.00]
+; BTVER2-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:0.50]
+; BTVER2-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_unpcklpd:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
-; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:0.50]
+; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [6:1.00]
+; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_unpcklpd:
; ZNVER1-SSE: # %bb.0:
-; ZNVER1-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
-; ZNVER1-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.25]
-; ZNVER1-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
-; ZNVER1-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; ZNVER1-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.25]
+; ZNVER1-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [8:0.50]
+; ZNVER1-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:0.50]
+; ZNVER1-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
; ZNVER1-SSE-NEXT: retq # sched: [1:0.50]
;
; ZNVER1-LABEL: test_unpcklpd:
; ZNVER1: # %bb.0:
-; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
-; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [8:0.50]
-; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:0.50]
+; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
%2 = load <2 x double>, <2 x double> *%a2, align 16
Modified: llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll Thu Aug 9 05:30:02 2018
@@ -832,26 +832,25 @@ declare <8 x i16> @llvm.x86.sse41.mpsadb
define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mul_epi32:
; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT: psrad $31, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE-NEXT: pmuldq %xmm1, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE-NEXT: pmuldq %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX1-LABEL: test_mm_mul_epi32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: ret{{[l|q]}}
Modified: llvm/trunk/test/CodeGen/X86/vec_insert-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_insert-3.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_insert-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_insert-3.ll Thu Aug 9 05:30:02 2018
@@ -7,9 +7,10 @@ define <2 x i64> @t1(i64 %s, <2 x i64> %
; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; X32-NEXT: movaps %xmm0, %xmm2
+; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X32-NEXT: retl
;
Modified: llvm/trunk/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll Thu Aug 9 05:30:02 2018
@@ -537,30 +537,29 @@ entry:
define <3 x double> @constrained_vector_fma_v3f64() {
; NO-FMA-LABEL: constrained_vector_fma_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; NO-FMA-NEXT: callq fma
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; NO-FMA-NEXT: callq fma
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; NO-FMA-NEXT: callq fma
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -964,27 +963,26 @@ entry:
define <3 x double> @constrained_vector_pow_v3f64() {
; NO-FMA-LABEL: constrained_vector_pow_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: callq pow
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: callq pow
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; NO-FMA-NEXT: callq pow
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -1190,27 +1188,26 @@ entry:
define <3 x double> @constrained_vector_powi_v3f64() {
; NO-FMA-LABEL: constrained_vector_powi_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movl $3, %edi
; NO-FMA-NEXT: callq __powidf2
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movl $3, %edi
; NO-FMA-NEXT: callq __powidf2
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: movl $3, %edi
; NO-FMA-NEXT: callq __powidf2
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -1404,24 +1401,23 @@ entry:
define <3 x double> @constrained_vector_sin_v3f64() {
; NO-FMA-LABEL: constrained_vector_sin_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq sin
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq sin
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq sin
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -1601,24 +1597,23 @@ entry:
define <3 x double> @constrained_vector_cos_v3f64() {
; NO-FMA-LABEL: constrained_vector_cos_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq cos
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq cos
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq cos
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -1798,24 +1793,23 @@ entry:
define <3 x double> @constrained_vector_exp_v3f64() {
; NO-FMA-LABEL: constrained_vector_exp_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -1995,24 +1989,23 @@ entry:
define <3 x double> @constrained_vector_exp2_v3f64() {
; NO-FMA-LABEL: constrained_vector_exp2_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp2
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp2
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq exp2
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -2192,24 +2185,23 @@ entry:
define <3 x double> @constrained_vector_log_v3f64() {
; NO-FMA-LABEL: constrained_vector_log_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -2389,24 +2381,23 @@ entry:
define <3 x double> @constrained_vector_log10_v3f64() {
; NO-FMA-LABEL: constrained_vector_log10_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log10
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log10
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log10
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -2586,24 +2577,23 @@ entry:
define <3 x double> @constrained_vector_log2_v3f64() {
; NO-FMA-LABEL: constrained_vector_log2_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log2
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log2
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq log2
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -2765,24 +2755,23 @@ entry:
define <3 x double> @constrained_vector_rint_v3f64() {
; NO-FMA-LABEL: constrained_vector_rint_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq rint
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq rint
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq rint
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
@@ -2912,24 +2901,23 @@ entry:
define <3 x double> @constrained_vector_nearby_v3f64() {
; NO-FMA-LABEL: constrained_vector_nearby_v3f64:
; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $24, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 32
+; NO-FMA-NEXT: subq $56, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 64
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq nearbyint
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq nearbyint
-; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; NO-FMA-NEXT: callq nearbyint
; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps %xmm0, %xmm1
-; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: addq $24, %rsp
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT: addq $56, %rsp
; NO-FMA-NEXT: .cfi_def_cfa_offset 8
; NO-FMA-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll Thu Aug 9 05:30:02 2018
@@ -19,8 +19,8 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -34,7 +34,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -48,7 +48,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -107,8 +107,8 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -153,11 +153,11 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -177,11 +177,11 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
+; AVX512BW-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -274,8 +274,8 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -345,11 +345,11 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -527,8 +527,8 @@ define i64 @test_v16i64(<16 x i64> %a0)
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -648,11 +648,11 @@ define i64 @test_v16i64(<16 x i64> %a0)
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -779,13 +779,13 @@ define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -829,13 +829,13 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pmuludq %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -901,29 +901,29 @@ define i32 @test_v16i32(<16 x i32> %a0)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
@@ -993,63 +993,63 @@ define i32 @test_v32i32(<32 x i32> %a0)
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE2-NEXT: pmuludq %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT: pmuludq %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i32:
@@ -1841,9 +1841,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
@@ -1851,15 +1851,14 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -2229,9 +2228,9 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
@@ -2239,15 +2238,14 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -2782,9 +2780,9 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
@@ -2792,15 +2790,14 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
Modified: llvm/trunk/test/CodeGen/X86/vector-sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-sext.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-sext.ll Thu Aug 9 05:30:02 2018
@@ -2190,8 +2190,8 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $6, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $5, %ecx
; SSE2-NEXT: andl $1, %ecx
@@ -2199,9 +2199,9 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $4, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $3, %ecx
; SSE2-NEXT: andl $1, %ecx
@@ -2209,22 +2209,20 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $2, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: shrl %eax
; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: retq
@@ -2238,8 +2236,8 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $6, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $5, %ecx
; SSSE3-NEXT: andl $1, %ecx
@@ -2247,9 +2245,9 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $4, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $3, %ecx
; SSSE3-NEXT: andl $1, %ecx
@@ -2257,22 +2255,20 @@ define <8 x i32> @load_sext_8i1_to_8i32(
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $2, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: shrl %eax
; SSSE3-NEXT: andl $1, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: retq
@@ -3017,62 +3013,60 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $10, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $9, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $8, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $7, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $6, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $5, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $4, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $3, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $2, %ecx
; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: shrl %eax
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: psllw $15, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: retq
@@ -3105,62 +3099,60 @@ define <16 x i16> @load_sext_16i1_to_16i
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $10, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $9, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $8, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $7, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $6, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $5, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $4, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $3, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $2, %ecx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: shrl %eax
; SSSE3-NEXT: andl $1, %eax
; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psllw $15, %xmm0
; SSSE3-NEXT: psraw $15, %xmm0
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: psllw $15, %xmm1
; SSSE3-NEXT: psraw $15, %xmm1
; SSSE3-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Thu Aug 9 05:30:02 2018
@@ -1114,31 +1114,28 @@ define <4 x float> @shuffle_v4f32_0z24(<
; SSE2-LABEL: shuffle_v4f32_0z24:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0z24:
; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
-; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: xorps %xmm2, %xmm2
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE3-NEXT: movaps %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0z24:
; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_0z24:
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll Thu Aug 9 05:30:02 2018
@@ -1567,8 +1567,8 @@ define <4 x i64> @insert_dup_mem_v4i64(i
define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1234:
; AVX1: # %bb.0:
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
; AVX1-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll Thu Aug 9 05:30:02 2018
@@ -2443,9 +2443,9 @@ define <8 x i32> @insert_dup_mem_v8i32(i
define <8 x i32> @shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_12345678:
; AVX1: # %bb.0:
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
; AVX1-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll Thu Aug 9 05:30:02 2018
@@ -1849,10 +1849,10 @@ define <4 x i8> @combine_test4c(<4 x i8>
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
@@ -1861,10 +1861,10 @@ define <4 x i8> @combine_test4c(<4 x i8>
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
@@ -2764,33 +2764,12 @@ entry:
}
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
-; SSE2-LABEL: PR22412:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: movapd %xmm2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR22412:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSSE3-NEXT: movapd %xmm2, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
-; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
-; SSSE3-NEXT: movaps %xmm3, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: PR22412:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
-; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: retq
+; SSE-LABEL: PR22412:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: retq
;
; AVX1-LABEL: PR22412:
; AVX1: # %bb.0: # %entry
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll Thu Aug 9 05:30:02 2018
@@ -5515,22 +5515,18 @@ define <4 x i32> @mul_add_const_v4i64_v4
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm1, %xmm4
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm2, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pmuludq %xmm1, %xmm5
+; SSE-NEXT: movdqa %xmm2, %xmm6
+; SSE-NEXT: pmuludq %xmm4, %xmm6
+; SSE-NEXT: paddq %xmm5, %xmm6
+; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: paddq %xmm5, %xmm2
+; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm0, %xmm4
+; SSE-NEXT: pmuludq %xmm4, %xmm1
+; SSE-NEXT: pmuludq %xmm3, %xmm4
; SSE-NEXT: paddq %xmm1, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm0
@@ -5559,37 +5555,34 @@ define <4 x i32> @mul_add_self_v4i64_v4i
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm1, %xmm4
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm0, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: psrad $31, %xmm6
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: psrad $31, %xmm5
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: psrad $31, %xmm7
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; SSE-NEXT: pxor %xmm8, %xmm8
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
+; SSE-NEXT: pmuludq %xmm1, %xmm6
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; SSE-NEXT: pmuludq %xmm0, %xmm7
+; SSE-NEXT: paddq %xmm6, %xmm7
+; SSE-NEXT: psllq $32, %xmm7
; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: psrlq $32, %xmm0
-; SSE-NEXT: pmuludq %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
+; SSE-NEXT: paddq %xmm7, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE-NEXT: pmuludq %xmm4, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE-NEXT: pmuludq %xmm2, %xmm5
+; SSE-NEXT: paddq %xmm3, %xmm5
+; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: paddq %xmm0, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: paddq %xmm4, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSE-NEXT: paddq %xmm5, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -5614,22 +5607,18 @@ define <4 x i32> @mul_add_multiuse_v4i64
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm1, %xmm5
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: psrlq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm3, %xmm6
-; SSE-NEXT: paddq %xmm5, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: pmuludq %xmm1, %xmm6
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: pmuludq %xmm5, %xmm7
+; SSE-NEXT: paddq %xmm6, %xmm7
+; SSE-NEXT: psllq $32, %xmm7
; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: paddq %xmm6, %xmm3
+; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm2, %xmm5
+; SSE-NEXT: pmuludq %xmm5, %xmm1
+; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: paddq %xmm1, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm4, %xmm2
Modified: llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll?rev=339335&r1=339334&r2=339335&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll (original)
+++ llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll Thu Aug 9 05:30:02 2018
@@ -1304,59 +1304,59 @@ define void @interleaved_store_vf32_i8_s
; AVX1-LABEL: interleaved_store_vf32_i8_stride3:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
-; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
-; AVX1-NEXT: vmovups %ymm1, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm2, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm1, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_store_vf32_i8_stride3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3
+; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
-; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm1, 32(%rdi)
; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1364,21 +1364,21 @@ define void @interleaved_store_vf32_i8_s
; AVX512-LABEL: interleaved_store_vf32_i8_stride3:
; AVX512: # %bb.0:
; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
-; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
-; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
-; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
-; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
-; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
-; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
-; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3
+; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi)
; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512-NEXT: vzeroupper
@@ -1397,65 +1397,67 @@ define void @interleaved_store_vf64_i8_s
; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm6[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10
+; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm10[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7
-; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm6 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm5, %xmm14, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
-; AVX1-NEXT: vpshufb %xmm5, %xmm9, %xmm6
-; AVX1-NEXT: vpshufb %xmm5, %xmm15, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT: vpshufb %xmm5, %xmm11, %xmm7
-; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT: vmovups %ymm3, 160(%rdi)
+; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm1
+; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm4
+; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vpshufb %xmm7, %xmm9, %xmm4
+; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm5
+; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmovups %ymm5, 160(%rdi)
; AVX1-NEXT: vmovups %ymm4, 128(%rdi)
-; AVX1-NEXT: vmovups %ymm6, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm3, 96(%rdi)
; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
-; AVX1-NEXT: vmovups %ymm2, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1463,38 +1465,38 @@ define void @interleaved_store_vf64_i8_s
; AVX2: # %bb.0:
; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm6
+; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm9[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6
; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
+; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm4
; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, 160(%rdi)
-; AVX2-NEXT: vmovdqu %ymm4, 128(%rdi)
+; AVX2-NEXT: vmovdqu %ymm3, 128(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
-; AVX2-NEXT: vmovdqu %ymm5, 32(%rdi)
-; AVX2-NEXT: vmovdqu %ymm2, 96(%rdi)
+; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm4, 96(%rdi)
; AVX2-NEXT: vmovdqu %ymm6, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq