[llvm] r229378 - [x86] Add a generic unpack-targeted lowering technique. This can be used
Chandler Carruth
chandlerc at gmail.com
Mon Feb 16 04:28:18 PST 2015
Author: chandlerc
Date: Mon Feb 16 06:28:18 2015
New Revision: 229378
URL: http://llvm.org/viewvc/llvm-project?rev=229378&view=rev
Log:
[x86] Add a generic unpack-targeted lowering technique. This can be used
to generically lower blends and is particularly nice because it is
available from SSE2 onward. This removes a lot of the remaining
domain-crossing blends in SSE2 code.

I'm hoping to replace some of the "interleaved" lowering hacks with
something closer to this, which should be more principled. First, this
needs to learn how to detect and use other interleavings besides that of
the natural type provided. That will be a follow-up patch, though.
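
To see the shape of the transform outside of SelectionDAG, here is a minimal,
self-contained C++ model of the permute-then-unpack idea (not part of the
patch; the array-based shuffle helper and all values are illustrative):

  #include <array>
  #include <cassert>

  // Two-input shuffle with LLVM mask semantics: indices 0-3 pick from A,
  // indices 4-7 pick from B.
  using V4 = std::array<int, 4>;
  static V4 shuffle(const V4 &A, const V4 &B, const std::array<int, 4> &M) {
    V4 R{};
    for (int i = 0; i < 4; ++i)
      R[i] = M[i] < 4 ? A[M[i]] : B[M[i] - 4];
    return R;
  }

  int main() {
    V4 V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
    // The alternating blend <0,4,2,6>: even slots from V1, odd from V2.
    // First permute each input so its elements land in the low half...
    V4 P1 = shuffle(V1, V1, {0, 2, 2, 3}); // pshufd $0xe8
    V4 P2 = shuffle(V2, V2, {0, 2, 2, 3}); // pshufd $0xe8
    // ...then a single low unpack (punpckldq) interleaves those halves.
    V4 R = shuffle(P1, P2, {0, 4, 1, 5});
    assert((R == shuffle(V1, V2, {0, 4, 2, 6})));
    return 0;
  }

This is the pshufd/pshufd/punpckldq pattern that replaces the pairs of
domain-crossing shufps instructions in the test updates below.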
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/pmul.ll
llvm/trunk/test/CodeGen/X86/sse2.ll
llvm/trunk/test/CodeGen/X86/vector-idiv.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=229378&r1=229377&r2=229378&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Feb 16 06:28:18 2015
@@ -8463,6 +8463,50 @@ static SDValue lowerVectorShuffleAsInser
DAG.getConstant(InsertPSMask, MVT::i8));
}
+/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up alternating between the
+/// two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction.
+static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
+                                          SDValue V2, ArrayRef<int> Mask,
+                                          SelectionDAG &DAG) {
+  assert(!isSingleInputShuffleMask(Mask) &&
+         "This routine should only be used when blending two inputs.");
+  assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+  int Size = Mask.size();
+
+  int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
+    return M >= 0 && M % Size < Size / 2;
+  });
+  int NumHiInputs = std::count_if(
+      Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
+
+  bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+  SmallVector<int, 32> V1Mask(Mask.size(), -1);
+  SmallVector<int, 32> V2Mask(Mask.size(), -1);
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+
+    // We only handle the case where V1 feeds even mask slots and V2 feeds odd
+    // mask slots. We rely on canonicalization to ensure this is the case.
+    if ((i % 2 == 0) != (Mask[i] < Size))
+      return SDValue();
+
+    SmallVectorImpl<int> &VMask = (i % 2 == 0) ? V1Mask : V2Mask;
+    VMask[i / 2 + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size;
+  }
+
+  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+  return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V1,
+                     V2);
+}
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -8921,6 +8965,11 @@ static SDValue lowerV4I32VectorShuffle(S
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
Mask, DAG);
+  // Try to lower by permuting the inputs into an unpack instruction.
+  if (SDValue Unpack =
+          lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
+    return Unpack;
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
@@ -9670,6 +9719,11 @@ static SDValue lowerV8I16VectorShuffle(S
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
Mask, DAG);
+  // Try to lower by permuting the inputs into an unpack instruction.
+  if (SDValue Unpack =
+          lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
+    return Unpack;
+
int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
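
As a reference for reading the test updates below, here is a standalone trace
of the mask-splitting loop above on one of the blends the tests exercise
(plain C++ with illustrative names mirroring the patch, no LLVM types):

  #include <cstdio>
  #include <vector>

  int main() {
    // The <1,5,3,7> blend that the vector-idiv tests below now lower.
    std::vector<int> Mask = {1, 5, 3, 7};
    int Size = Mask.size();
    int NumLo = 0, NumHi = 0;
    for (int M : Mask) {
      if (M >= 0 && M % Size < Size / 2) ++NumLo;
      if (M % Size >= Size / 2) ++NumHi;
    }
    bool UnpackLo = NumLo >= NumHi; // a tie goes to the low unpack
    std::vector<int> V1Mask(Size, -1), V2Mask(Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0) continue;
      // Even slots must come from V1, odd slots from V2.
      if ((i % 2 == 0) != (Mask[i] < Size)) return 1;
      std::vector<int> &VMask = (i % 2 == 0) ? V1Mask : V2Mask;
      VMask[i / 2 + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size;
    }
    // Prints "1 3 -1 -1 | 1 3 -1 -1": each input is permuted by
    // pshufd {1,3,u,u}, and one punpckldq then yields <1,5,3,7>.
    for (int M : V1Mask) printf("%d ", M);
    printf("| ");
    for (int M : V2Mask) printf("%d ", M);
    printf("\n");
    return 0;
  }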
Modified: llvm/trunk/test/CodeGen/X86/pmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmul.ll?rev=229378&r1=229377&r2=229378&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmul.ll Mon Feb 16 06:28:18 2015
@@ -7,9 +7,10 @@ define <4 x i32> @a(<4 x i32> %i) nounwi
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: a:
@@ -46,10 +47,11 @@ define <4 x i32> @c(<4 x i32> %i, <4 x i
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: c:
@@ -91,13 +93,14 @@ define <4 x i32> @e(<4 x i32> %i, <4 x i
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: callq foo
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/sse2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2.ll?rev=229378&r1=229377&r2=229378&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2.ll Mon Feb 16 06:28:18 2015
@@ -315,10 +315,11 @@ define <4 x i32> @test_mul(<4 x i32> %x,
; CHECK: ## BB#0:
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: retl
%m = mul <4 x i32> %x, %y
ret <4 x i32> %m
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv.ll?rev=229378&r1=229377&r2=229378&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv.ll Mon Feb 16 06:28:18 2015
@@ -25,11 +25,12 @@ define <4 x i32> @test1(<4 x i32> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: psrld $1, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
@@ -85,20 +86,22 @@ define <8 x i32> @test2(<8 x i32> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE-NEXT: psubd %xmm3, %xmm0
; SSE-NEXT: psrld $1, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm0
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
@@ -838,21 +841,22 @@ define <4 x i32> @test8(<4 x i32> %a) {
;
; SSE-LABEL: test8:
; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pmuludq %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: paddd %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT: psubd %xmm3, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pmuludq %xmm1, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: paddd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $31, %xmm0
@@ -911,46 +915,48 @@ define <8 x i32> @test9(<8 x i32> %a) {
;
; SSE-LABEL: test9:
; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrad $31, %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: psrad $31, %xmm5
-; SSE-NEXT: pand %xmm1, %xmm5
+; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: paddd %xmm0, %xmm5
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pmuludq %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq %xmm6, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT: psubd %xmm5, %xmm0
-; SSE-NEXT: paddd %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrld $31, %xmm3
-; SSE-NEXT: psrad $2, %xmm0
-; SSE-NEXT: paddd %xmm3, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: paddd %xmm4, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm6, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT: psubd %xmm3, %xmm1
-; SSE-NEXT: paddd %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrld $31, %xmm2
-; SSE-NEXT: psrad $2, %xmm1
-; SSE-NEXT: paddd %xmm2, %xmm1
+; SSE-NEXT: psrad $2, %xmm0
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: psrad $31, %xmm5
+; SSE-NEXT: pand %xmm3, %xmm5
+; SSE-NEXT: paddd %xmm4, %xmm5
+; SSE-NEXT: pmuludq %xmm1, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm6, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: psubd %xmm5, %xmm2
+; SSE-NEXT: paddd %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: psrld $31, %xmm1
+; SSE-NEXT: psrad $2, %xmm2
+; SSE-NEXT: paddd %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test9:
@@ -1006,41 +1012,45 @@ define <8 x i32> @test10(<8 x i32> %a) {
;
; SSE-LABEL: test10:
; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: psubd %xmm3, %xmm5
+; SSE-NEXT: psubd %xmm2, %xmm5
; SSE-NEXT: psrld $1, %xmm5
-; SSE-NEXT: paddd %xmm3, %xmm5
+; SSE-NEXT: paddd %xmm2, %xmm5
; SSE-NEXT: psrld $2, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm3, %xmm5
-; SSE-NEXT: pmuludq %xmm3, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT: psubd %xmm5, %xmm0
-; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pmuludq %xmm1, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psubd %xmm2, %xmm4
+; SSE-NEXT: psubd %xmm3, %xmm4
; SSE-NEXT: psrld $1, %xmm4
-; SSE-NEXT: paddd %xmm2, %xmm4
+; SSE-NEXT: paddd %xmm3, %xmm4
; SSE-NEXT: psrld $2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm3, %xmm4
-; SSE-NEXT: pmuludq %xmm3, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSE-NEXT: psubd %xmm4, %xmm1
; SSE-NEXT: retq
;
@@ -1109,13 +1119,14 @@ define <8 x i32> @test11(<8 x i32> %a) {
; SSE-NEXT: psrad $31, %xmm6
; SSE-NEXT: pand %xmm2, %xmm6
; SSE-NEXT: paddd %xmm4, %xmm6
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: pmuludq %xmm2, %xmm7
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm5, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT: psubd %xmm6, %xmm7
; SSE-NEXT: paddd %xmm0, %xmm7
; SSE-NEXT: movdqa %xmm7, %xmm4
@@ -1125,9 +1136,10 @@ define <8 x i32> @test11(<8 x i32> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE-NEXT: pmuludq %xmm4, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; SSE-NEXT: psubd %xmm7, %xmm0
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: movdqa %xmm1, %xmm6
@@ -1135,10 +1147,11 @@ define <8 x i32> @test11(<8 x i32> %a) {
; SSE-NEXT: pand %xmm2, %xmm6
; SSE-NEXT: paddd %xmm3, %xmm6
; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm5, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm3
@@ -1147,9 +1160,10 @@ define <8 x i32> @test11(<8 x i32> %a) {
; SSE-NEXT: paddd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: retq
;
@@ -1222,15 +1236,15 @@ define <4 x i32> @PR20355(<4 x i32> %a)
; SSE-NEXT: paddd %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT: psubd %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $31, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: psubd %xmm3, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: psrld $31, %xmm0
+; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR20355:
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll?rev=229378&r1=229377&r2=229378&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll Mon Feb 16 06:28:18 2015
@@ -549,28 +549,27 @@ define <16 x i8> @shuffle_v16i8_16_17_18
; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,0,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,0,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@@ -580,9 +579,9 @@ define <16 x i8> @shuffle_v16i8_16_17_18
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll?rev=229378&r1=229377&r2=229378&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll Mon Feb 16 06:28:18 2015
@@ -275,16 +275,18 @@ define <4 x i32> @combine_bitwise_ops_te
define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2: # BB#0:
-; SSE2-NEXT: andps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3: # BB#0:
-; SSSE3-NEXT: andps %xmm1, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
@@ -313,16 +315,18 @@ define <4 x i32> @combine_bitwise_ops_te
define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2: # BB#0:
-; SSE2-NEXT: orps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3: # BB#0:
-; SSSE3-NEXT: orps %xmm1, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
@@ -390,18 +394,18 @@ define <4 x i32> @combine_bitwise_ops_te
define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2: # BB#0:
-; SSE2-NEXT: andps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3: # BB#0:
-; SSSE3-NEXT: andps %xmm1, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
@@ -430,18 +434,18 @@ define <4 x i32> @combine_bitwise_ops_te
define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2: # BB#0:
-; SSE2-NEXT: orps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3: # BB#0:
-; SSSE3-NEXT: orps %xmm1, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
@@ -1012,14 +1016,16 @@ define <4 x i32> @combine_nested_undef_t
define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
; SSE2: # BB#0:
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
; SSSE3: # BB#0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test16:
@@ -1171,16 +1177,16 @@ define <4 x i32> @combine_nested_undef_t
define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test21:
; SSE2: # BB#0:
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test21:
; SSSE3: # BB#0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test21: