[llvm] r339686 - [X86][SSE] Add shuffle combine support for OR(PSHUFB, PSHUFB) style patterns.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 14 09:00:06 PDT 2018
Author: rksimon
Date: Tue Aug 14 09:00:05 2018
New Revision: 339686
URL: http://llvm.org/viewvc/llvm-project?rev=339686&view=rev
Log:
[X86][SSE] Add shuffle combine support for OR(PSHUFB,PSHUFB) style patterns.
If each element is known to be zero in one (or both) of the inputs, then the OR can be folded into a single shuffle mask.
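
For reference, below is a minimal standalone sketch of the mask-merge rule this patch implements, written outside of LLVM. The sentinel constants and the helper name mergeOrMasks are illustrative only; they stand in for SM_SentinelUndef/SM_SentinelZero and the loop added to getFauxShuffleMask, and this is not the patch's own code.

#include <cassert>
#include <cstdio>
#include <vector>

// Illustrative sentinels, analogous to LLVM's SM_SentinelUndef/SM_SentinelZero.
constexpr int SentinelUndef = -1;
constexpr int SentinelZero  = -2;

// Merge the masks of OR(shuffle0, shuffle1) into one mask. Returns false if
// any lane may be non-zero in both inputs, in which case the OR cannot be
// expressed as a single shuffle. Indices taken from Mask1 are rebased past
// shuffle0's inputs, mirroring the
// "Mask1[i] + (MaskSize * SrcInputs0.size())" adjustment in the patch.
static bool mergeOrMasks(const std::vector<int> &Mask0,
                         const std::vector<int> &Mask1,
                         int NumInputs0, std::vector<int> &Merged) {
  assert(Mask0.size() == Mask1.size() && "masks must have the same width");
  int Size = static_cast<int>(Mask0.size());
  for (int i = 0; i != Size; ++i) {
    if (Mask0[i] == SentinelUndef && Mask1[i] == SentinelUndef)
      Merged.push_back(SentinelUndef);
    else if (Mask0[i] == SentinelZero && Mask1[i] == SentinelZero)
      Merged.push_back(SentinelZero);
    else if (Mask1[i] == SentinelZero)
      Merged.push_back(Mask0[i]);                      // lane comes from shuffle0
    else if (Mask0[i] == SentinelZero)
      Merged.push_back(Mask1[i] + Size * NumInputs0);  // lane comes from shuffle1
    else
      return false;                                    // both lanes may be non-zero
  }
  return true;
}

int main() {
  // OR(shuffle keeping lanes 0..3, shuffle keeping lanes 4..7) is foldable.
  std::vector<int> Mask0 = {0, 1, 2, 3,
                            SentinelZero, SentinelZero, SentinelZero, SentinelZero};
  std::vector<int> Mask1 = {SentinelZero, SentinelZero, SentinelZero, SentinelZero,
                            4, 5, 6, 7};
  std::vector<int> Merged;
  if (mergeOrMasks(Mask0, Mask1, /*NumInputs0=*/1, Merged))
    for (int M : Merged)
      std::printf("%d ", M);  // prints: 0 1 2 3 12 13 14 15
  std::printf("\n");
  return 0;
}

The merged mask takes its low half from the first shuffle's input and its high half from the second shuffle's input (rebased past the first), which is exactly the single-shuffle form the updated blend/unpack tests below now produce.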
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/oddshuffles.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=339686&r1=339685&r2=339686&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Aug 14 09:00:05 2018
@@ -4406,6 +4406,7 @@ static bool isTargetShuffleVariableMask(
case X86ISD::VPERMV3:
return true;
// 'Faux' Target Shuffles.
+ case ISD::OR:
case ISD::AND:
case X86ISD::ANDNP:
return true;
@@ -6166,6 +6167,12 @@ static bool setTargetShuffleZeroElements
return true;
}
+// Forward declaration (for getFauxShuffleMask recursive check).
+static bool resolveTargetShuffleInputs(SDValue Op,
+ SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ const SelectionDAG &DAG);
+
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
@@ -6219,6 +6226,43 @@ static bool getFauxShuffleMask(SDValue N
Ops.push_back(IsAndN ? N1 : N0);
return true;
}
+ case ISD::OR: {
+ // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
+ // is a valid shuffle index.
+ SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
+ SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
+ if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
+ return false;
+ SmallVector<int, 64> SrcMask0, SrcMask1;
+ SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
+ if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
+ !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
+ return false;
+ // TODO - Add support for more than 2 inputs.
+ if ((SrcInputs0.size() + SrcInputs1.size()) > 2)
+ return false;
+ int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
+ SmallVector<int, 64> Mask0, Mask1;
+ scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
+ scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
+ for (int i = 0; i != MaskSize; ++i) {
+ if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
+ Mask.push_back(SM_SentinelUndef);
+ else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
+ Mask.push_back(SM_SentinelZero);
+ else if (Mask1[i] == SM_SentinelZero)
+ Mask.push_back(Mask0[i]);
+ else if (Mask0[i] == SM_SentinelZero)
+ Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size()));
+ else
+ return false;
+ }
+ for (SDValue &Op : SrcInputs0)
+ Ops.push_back(Op);
+ for (SDValue &Op : SrcInputs1)
+ Ops.push_back(Op);
+ return true;
+ }
case ISD::SCALAR_TO_VECTOR: {
// Match against a scalar_to_vector of an extract from a vector,
// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
@@ -35216,6 +35260,15 @@ static SDValue combineOr(SDNode *N, Sele
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
+ // Attempt to recursively combine an OR of shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ return Res;
+ }
+
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
Modified: llvm/trunk/test/CodeGen/X86/oddshuffles.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/oddshuffles.ll?rev=339686&r1=339685&r2=339686&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/oddshuffles.ll (original)
+++ llvm/trunk/test/CodeGen/X86/oddshuffles.ll Tue Aug 14 09:00:05 2018
@@ -745,57 +745,53 @@ define void @interleave_24i8_out(<24 x i
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,1,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; SSE2-NEXT: packuswb %xmm0, %xmm3
-; SSE2-NEXT: movq %xmm3, (%rsi)
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,1,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: packuswb %xmm0, %xmm4
+; SSE2-NEXT: movq %xmm4, (%rsi)
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,1,0,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
-; SSE2-NEXT: packuswb %xmm0, %xmm3
-; SSE2-NEXT: movq %xmm3, (%rdx)
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; SSE2-NEXT: packuswb %xmm0, %xmm4
+; SSE2-NEXT: movq %xmm4, (%rdx)
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll?rev=339686&r1=339685&r2=339686&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll Tue Aug 14 09:00:05 2018
@@ -891,18 +891,12 @@ define <8 x i32> @combine_permd_insertio
define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) {
; X32-LABEL: combine_pshufb_pshufb_or_as_blend:
; X32: # %bb.0:
-; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; X32-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_pshufb_or_as_blend:
; X64: # %bb.0:
-; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; X64-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X64-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
@@ -913,18 +907,12 @@ define <32 x i8> @combine_pshufb_pshufb_
define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
; X32-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; X32: # %bb.0:
-; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
-; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; X32-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; X64: # %bb.0:
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
-; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
%2 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll?rev=339686&r1=339685&r2=339686&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll Tue Aug 14 09:00:05 2018
@@ -669,42 +669,19 @@ declare <16 x i8> @llvm.x86.sse2.packusw
define <16 x i8> @combine_pshufb_pshufb_or_as_blend(<16 x i8> %a0, <16 x i8> %a1) {
; SSSE3-LABEL: combine_pshufb_pshufb_or_as_blend:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[8,9,10,11,12,13,14,15]
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_pshufb_or_as_blend:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_pshufb_pshufb_or_as_blend:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_pshufb_pshufb_or_as_blend:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: combine_pshufb_pshufb_or_as_blend:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: retq
+; AVX-LABEL: combine_pshufb_pshufb_or_as_blend:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
%3 = or <16 x i8> %1, %2
@@ -712,28 +689,14 @@ define <16 x i8> @combine_pshufb_pshufb_
}
define <16 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
-; SSSE3-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
+; SSE: # %bb.0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)