[llvm] r211890 - [x86] Teach the target-specific combining how to aggressively fold
Chandler Carruth
chandlerc at gmail.com
Fri Jun 27 04:34:41 PDT 2014
Author: chandlerc
Date: Fri Jun 27 06:34:40 2014
New Revision: 211890
URL: http://llvm.org/viewvc/llvm-project?rev=211890&view=rev
Log:
[x86] Teach the target-specific combining how to aggressively fold
half-shuffles, even looking through intervening instructions in a chain.
Summary:
This doesn't happen to show up with any test cases I've found for the current
shuffle lowering, but previous attempts would benefit from this and it seems
generally useful. I've tested it directly using intrinsics, which also shows
that it will work with hand vectorized code as well.
Note that even though pshufd isn't directly used in these tests, it gets
exercised because we combine some of the half shuffles into a pshufd
first, and then merge them.
Differential Revision: http://reviews.llvm.org/D4291
Added:
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=211890&r1=211889&r2=211890&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jun 27 06:34:40 2014
@@ -19061,6 +19061,88 @@ static SmallVector<int, 4> getPSHUFShuff
}
}
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+///
+/// We walk up the chain, skipping shuffles of the other half and looking
+/// through shuffles which switch halves trying to find a shuffle of the same
+/// pair of dwords.
+static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(
+ (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+ unsigned CombineOpcode = N.getOpcode();
+
+ // Walk up a single-use chain looking for a combinable shuffle.
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return false; // Nothing combined!
+
+ case ISD::BITCAST:
+ // Skip bitcasts as we always know the type for the target specific
+ // instructions.
+ continue;
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOpcode)
+ break;
+
+ // Other-half shuffles are no-ops.
+ continue;
+
+ case X86ISD::PSHUFD: {
+ // We can only handle pshufd if the half we are combining either stays in
+ // its half, or switches to the other half. Bail if one of these isn't
+ // true.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
+ if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
+ (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
+ return false;
+
+ // Map the mask through the pshufd and keep walking up the chain.
+ for (int i = 0; i < 4; ++i)
+ Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
+
+ // Switch halves if the pshufd does.
+ CombineOpcode =
+ VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ continue;
+ }
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return false;
+
+ // Record the old value to use in RAUW-ing.
+ SDValue Old = V;
+
+ // Merge this node's mask and our incoming mask (adjusted to account for all
+ // the pshufd instructions encountered).
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // Replace N with its operand as we're going to combine that shuffle away.
+ DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+ // Replace the combinable shuffle with the combined one, updating all users
+ // so that we re-evaluate the chain here.
+ DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+ return true;
+}
+
/// \brief Try to combine x86 target specific shuffles.
static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -19080,6 +19162,11 @@ static SDValue PerformTargetShuffleCombi
return SDValue();
}
+ // Nuke no-op shuffles that show up after combining.
+ if (isNoopShuffleMask(Mask))
+ return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+ // Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);
switch (N.getOpcode()) {
default:
@@ -19088,6 +19175,9 @@ static SDValue PerformTargetShuffleCombi
case X86ISD::PSHUFHW:
assert(VT == MVT::v8i16);
+ if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+ return SDValue(); // We combined away this shuffle, so we're done.
+
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations.
if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
Added: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll?rev=211890&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll Fri Jun 27 06:34:40 2014
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
+declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
+
+define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw1
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+ ret <8 x i16> %c
+}
+
+define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw2
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw3
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufhw1
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
+ ret <8 x i16> %d
+}
+
More information about the llvm-commits
mailing list