[llvm] r211889 - [x86] Teach the X86 backend to DAG-combine SSE2 shuffles that are
Chandler Carruth
chandlerc at gmail.com
Fri Jun 27 04:27:53 PDT 2014
Author: chandlerc
Date: Fri Jun 27 06:27:52 2014
New Revision: 211889
URL: http://llvm.org/viewvc/llvm-project?rev=211889&view=rev
Log:
[x86] Teach the X86 backend to DAG-combine SSE2 shuffles that are
trivially redundant.
This fixes several cases in the new vector shuffle lowering algorithm
which would generate redundant shuffle instructions for the sake of
simplicity.
I'm also deleting a testcase which was somewhat ridiculous. It was
checking for a bug in 2007 about incorrectly transforming shuffles by
looking for the string "-86" in the output of a pretty substantial
function. This test case doesn't seem to have any value at this point.
Differential Revision: http://reviews.llvm.org/D4240
Removed:
llvm/trunk/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=211889&r1=211888&r2=211889&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jun 27 06:27:52 2014
@@ -19034,6 +19034,95 @@ static SDValue PerformShuffleCombine256(
return SDValue();
}
+/// \brief Get the PSHUF-style mask from PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
+/// PSHUF-style masks that can be reused with such instructions.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ SmallVector<int, 4> Mask;
+ bool IsUnary;
+ bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+ (void)HaveMask;
+ assert(HaveMask);
+
+ switch (N.getOpcode()) {
+ case X86ISD::PSHUFD:
+ return Mask;
+ case X86ISD::PSHUFLW:
+ Mask.resize(4);
+ return Mask;
+ case X86ISD::PSHUFHW:
+ Mask.erase(Mask.begin(), Mask.begin() + 4);
+ for (int &M : Mask)
+ M -= 4;
+ return Mask;
+ default:
+ llvm_unreachable("No valid shuffle instruction found!");
+ }
+}
+
+/// \brief Try to combine x86 target specific shuffles.
+static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+ SmallVector<int, 4> Mask;
+
+ switch (N.getOpcode()) {
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ Mask = getPSHUFShuffleMask(N);
+ assert(Mask.size() == 4);
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDValue V = N.getOperand(0);
+ switch (N.getOpcode()) {
+ default:
+ break;
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ assert(VT == MVT::v8i16);
+
+ // See if this reduces to a PSHUFD which is no more expensive and can
+ // combine with more operations.
+ if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
+ areAdjacentMasksSequential(Mask)) {
+ int DMask[] = {-1, -1, -1, -1};
+ int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+ DMask[DOffset + 0] = DOffset + Mask[0] / 2;
+ DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+ DCI.AddToWorklist(V.getNode());
+ V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+ getV4X86ShuffleImm8ForMask(DMask, DAG));
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ }
+
+ // Fallthrough
+ case X86ISD::PSHUFD:
+ if (V.getOpcode() == N.getOpcode()) {
+ // If we have two sequential shuffles of the same kind we can always fold
+ // them. Even if there are multiple uses, this is beneficial because it
+ // breaks a dependency.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ return DAG.getNode(N.getOpcode(), DL, VT, V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+ }
+
+ break;
+ }
+
+ return SDValue();
+}
+
/// PerformShuffleCombine - Performs several different shuffle combines.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -19158,7 +19247,18 @@ static SDValue PerformShuffleCombine(SDN
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
- return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+ SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+ if (LD.getNode())
+ return LD;
+
+ if (isTargetShuffle(N->getOpcode())) {
+ SDValue Shuffle =
+ PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
+ return SDValue();
}
/// PerformTruncateCombine - Converts truncate operation to
Removed: llvm/trunk/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll?rev=211888&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll (original)
+++ llvm/trunk/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll (removed)
@@ -1,30 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep -- -86
-
-define i16 @f(<4 x float>* %tmp116117.i1061.i) nounwind {
-entry:
- alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:0 [#uses=167]
- alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:1 [#uses=170]
- alloca [4 x <4 x i32>] ; <[4 x <4 x i32>]*>:2 [#uses=12]
- %.sub6235.i = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0 ; <<4 x float>*> [#uses=76]
- %.sub.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0 ; <<4 x float>*> [#uses=59]
-
- %tmp124.i1062.i = getelementptr <4 x float>* %tmp116117.i1061.i, i32 63 ; <<4 x float>*> [#uses=1]
- %tmp125.i1063.i = load <4 x float>* %tmp124.i1062.i ; <<4 x float>> [#uses=5]
- %tmp828.i1077.i = shufflevector <4 x float> %tmp125.i1063.i, <4 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > ; <<4 x float>> [#uses=4]
- %tmp704.i1085.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1]
- %tmp712.i1086.i = call <4 x float> @llvm.x86.sse.max.ps( <4 x float> %tmp704.i1085.i, <4 x float> %tmp828.i1077.i ) ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp712.i1086.i, <4 x float>* %.sub.i
-
- %tmp2587.i1145.gep.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0, i32 2 ; <float*> [#uses=1]
- %tmp5334.i = load float* %tmp2587.i1145.gep.i ; <float> [#uses=5]
- %tmp2723.i1170.i = insertelement <4 x float> undef, float %tmp5334.i, i32 2 ; <<4 x float>> [#uses=5]
- store <4 x float> %tmp2723.i1170.i, <4 x float>* %.sub6235.i
-
- %tmp1406.i1367.i = shufflevector <4 x float> %tmp2723.i1170.i, <4 x float> undef, <4 x i32> < i32 2, i32 2, i32 2, i32 2 > ; <<4 x float>> [#uses=1]
- %tmp84.i1413.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1]
- %tmp89.i1415.i = fmul <4 x float> %tmp84.i1413.i, %tmp1406.i1367.i ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp89.i1415.i, <4 x float>* %.sub.i
- ret i16 0
-}
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll?rev=211889&r1=211888&r2=211889&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll Fri Jun 27 06:27:52 2014
@@ -51,7 +51,7 @@ define <8 x i16> @shuffle_v8i16_31206745
; CHECK-SSE2-LABEL: @shuffle_v8i16_31206745
; CHECK-SSE2: # BB#0:
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
@@ -159,7 +159,7 @@ define <8 x i16> @shuffle_v8i16_26401375
; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
ret <8 x i16> %shuffle
@@ -273,8 +273,7 @@ define <8 x i16> @shuffle_v8i16_4563XXXX
; CHECK-SSE2: # BB#0:
; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,3,0,1,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,2,3]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
@@ -285,8 +284,7 @@ define <8 x i16> @shuffle_v8i16_01274563
; CHECK-SSE2: # BB#0:
; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
@@ -297,8 +295,7 @@ define <8 x i16> @shuffle_v8i16_45630127
; CHECK-SSE2: # BB#0:
; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,3,0,1,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3]
; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
@@ -359,9 +356,8 @@ define <8 x i16> @shuffle_v8i16_08196e7f
define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d6879
; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,0,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[2,3,0,1,4,5,6,7]
; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
@@ -458,8 +454,7 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3
; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,2]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
More information about the llvm-commits
mailing list