[llvm] r260063 - [X86][SSE] Resolve target shuffle inputs to sentinels to permit more combines
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 7 14:51:06 PST 2016
Author: rksimon
Date: Sun Feb 7 16:51:06 2016
New Revision: 260063
URL: http://llvm.org/viewvc/llvm-project?rev=260063&view=rev
Log:
[X86][SSE] Resolve target shuffle inputs to sentinels to permit more combines
combineX86ShufflesRecursively only supports unary shuffles, so it was missing the opportunity to combine binary shuffles whose second input is zero or undef.
This patch resolves target shuffle inputs, converting the shuffle mask elements to SM_SentinelUndef/SM_SentinelZero where possible. It then re-examines the updated mask to check whether we have created a faux unary shuffle.
Additionally, we now attempt to recursively call combineX86ShufflesRecursively for all input operands (we used to recurse only for unary integer shuffles and unary unpacks); the call safely returns early if the operand is not a target shuffle.
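To illustrate the idea outside of SelectionDAG, here is a minimal standalone C++ sketch of the sentinel resolution, using plain vectors in place of DAG nodes; resolveToFauxUnary and Input1IsZero are hypothetical names for illustration, not code from the patch:

#include <vector>

enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };

// Rewrite mask elements that reference known-zero elements of the second
// input to SM_SentinelZero, then report whether the shuffle has become
// "faux unary" (no element still references the second input).
bool resolveToFauxUnary(std::vector<int> &Mask,
                        const std::vector<bool> &Input1IsZero) {
  int Size = (int)Mask.size();
  for (int &M : Mask) {
    if (M < Size)
      continue; // sentinel or first-input reference; leave it alone.
    if (Input1IsZero[M - Size])
      M = SM_SentinelZero; // second-input element is known zero.
  }
  for (int M : Mask)
    if (M >= Size)
      return false; // still a genuinely binary shuffle.
  return true;
}

For example, a v4i32 mask {4, 1, 6, 3} with an all-zeroes second input resolves to {SM_SentinelZero, 1, SM_SentinelZero, 3}, which the combiner can now treat as unary.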
Differential Revision: http://reviews.llvm.org/D16683
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-sext.ll
llvm/trunk/test/CodeGen/X86/vector-zext.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=260063&r1=260062&r2=260063&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Feb 7 16:51:06 2016
@@ -5094,7 +5094,9 @@ static bool setTargetShuffleZeroElements
if (M < 0)
continue;
+ // Determine shuffle input and normalize the mask.
SDValue V = M < Size ? V1 : V2;
+ M %= Size;
// We are referencing an UNDEF input.
if (V.isUndef()) {
@@ -5102,12 +5104,77 @@ static bool setTargetShuffleZeroElements
continue;
}
- // TODO - handle the Size != (int)V.getNumOperands() cases in future.
- if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
+ continue;
+
+ // If the BUILD_VECTOR has fewer elements, then the (larger) source
+ // element must be UNDEF/ZERO.
+ // TODO: Is it worth testing the individual bits of a constant?
+ if ((Size % V.getNumOperands()) == 0) {
+ unsigned Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef())
+ Mask[i] = SM_SentinelUndef;
+ else if (X86::isZeroNode(Op))
+ Mask[i] = SM_SentinelZero;
continue;
- if (!X86::isZeroNode(V.getOperand(M % Size)))
+ }
+
+ // If the BUILD_VECTOR has more elements, then all the (smaller) source
+ // elements must be all UNDEF or all ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ unsigned Scale = V->getNumOperands() / Size;
+ bool AllUndef = true;
+ bool AllZero = true;
+ for (unsigned j = 0; j != Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllUndef &= Op.isUndef();
+ AllZero &= X86::isZeroNode(Op);
+ }
+ if (AllUndef)
+ Mask[i] = SM_SentinelUndef;
+ else if (AllZero)
+ Mask[i] = SM_SentinelZero;
continue;
- Mask[i] = SM_SentinelZero;
+ }
+ }
+
+ return true;
+}
+
+/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
+/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
+/// remaining input indices in case we now have a unary shuffle and adjust the
+/// Op0/Op1 inputs accordingly.
+/// Returns true if the target shuffle mask was decoded.
+static bool resolveTargetShuffleInputs(SDValue Op, bool &IsUnary, SDValue &Op0,
+ SDValue &Op1,
+ SmallVectorImpl<int> &Mask) {
+ if (!setTargetShuffleZeroElements(Op, Mask))
+ return false;
+
+ int NumElts = Mask.size();
+ bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
+ return 0 <= Idx && Idx < NumElts;
+ });
+ bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
+ [NumElts](int Idx) { return NumElts <= Idx; });
+
+ Op0 = Op0InUse ? Op.getOperand(0) : SDValue();
+ Op1 = Op1InUse ? Op.getOperand(1) : SDValue();
+ IsUnary = !(Op0InUse && Op1InUse);
+
+ if (!IsUnary)
+ return true;
+
+ // We're only using Op1 - commute the mask and inputs.
+ if (!Op0InUse && Op1InUse) {
+ for (int &M : Mask)
+ if (NumElts <= M)
+ M -= NumElts;
+ Op0 = Op1;
+ Op1 = SDValue();
}
return true;
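The commute step above can be sketched in isolation; std::string stands in for SDValue here, and commuteIfOnlySecondInputUsed is a hypothetical helper, not code from the patch:

#include <algorithm>
#include <string>
#include <vector>

// If only the second operand is referenced, rebase the mask onto the first
// operand slot and swap the operands, yielding a genuinely unary shuffle.
void commuteIfOnlySecondInputUsed(std::vector<int> &Mask, std::string &Op0,
                                  std::string &Op1) {
  int NumElts = (int)Mask.size();
  bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
    return 0 <= Idx && Idx < NumElts;
  });
  bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
                              [NumElts](int Idx) { return NumElts <= Idx; });
  if (!Op0InUse && Op1InUse) {
    for (int &M : Mask)
      if (NumElts <= M)
        M -= NumElts; // second-input indices become first-input indices.
    Op0 = Op1;
    Op1.clear();
  }
}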
@@ -23278,7 +23345,7 @@ static SDValue PerformShuffleCombine256(
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
-/// This is the leaf of the recursive combinine below. When we have found some
+/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
@@ -23439,13 +23506,19 @@ static bool combineX86ShuffleChain(SDVal
int NumBytes = VT.getSizeInBits() / 8;
int Ratio = NumBytes / Mask.size();
for (int i = 0; i < NumBytes; ++i) {
- if (Mask[i / Ratio] == SM_SentinelUndef) {
+ int M = Mask[i / Ratio];
+ if (M == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
- int M = Mask[i / Ratio] != SM_SentinelZero
- ? Ratio * Mask[i / Ratio] + i % Ratio
- : 255;
+ if (M == SM_SentinelZero) {
+ PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
+ continue;
+ }
+ M = Ratio * M + i % Ratio;
+ // Check that we are not crossing lanes.
+ if ((M / 16) != (i / 16))
+ return false;
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
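The byte-mask expansion above can be sketched standalone, assuming 16-byte lanes; expandToPSHUFBMask is an illustrative name, and plain ints replace the DAG constants:

#include <vector>

enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };

// Widen an element-level mask to a byte-level PSHUFB mask, assuming NumBytes
// is a multiple of Mask.size(). PSHUFB writes a zero byte whenever the index
// has its top bit set (255 here), and can only read bytes from within the
// same 128-bit lane, hence the lane check.
bool expandToPSHUFBMask(const std::vector<int> &Mask, int NumBytes,
                        std::vector<int> &PSHUFBMask) {
  int Ratio = NumBytes / (int)Mask.size();
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    if (M == SM_SentinelUndef) {
      PSHUFBMask.push_back(-1); // don't-care byte.
      continue;
    }
    if (M == SM_SentinelZero) {
      PSHUFBMask.push_back(255); // forces a zero byte.
      continue;
    }
    M = Ratio * M + i % Ratio;
    if ((M / 16) != (i / 16))
      return false; // byte would cross a 128-bit lane.
    PSHUFBMask.push_back(M);
  }
  return true;
}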
@@ -23518,13 +23591,15 @@ static bool combineX86ShufflesRecursivel
assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
"Can only combine shuffles of the same vector register size.");
- if (!isTargetShuffle(Op.getOpcode()))
- return false;
- SmallVector<int, 16> OpMask;
+ // Extract target shuffle mask and resolve sentinels and inputs.
bool IsUnary;
- bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary);
- // We only can combine unary shuffles which we can decode the mask for.
- if (!HaveMask || !IsUnary)
+ SDValue Input0, Input1;
+ SmallVector<int, 16> OpMask;
+ if (!resolveTargetShuffleInputs(Op, IsUnary, Input0, Input1, OpMask))
+ return false;
+
+ // At the moment we can only combine target shuffle unary cases.
+ if (!IsUnary)
return false;
assert(VT.getVectorNumElements() == OpMask.size() &&
@@ -23570,32 +23645,25 @@ static bool combineX86ShufflesRecursivel
RootMaskedIdx % OpRatio);
}
- // See if we can recurse into the operand to combine more things.
- switch (Op.getOpcode()) {
- case X86ISD::PSHUFB:
- HasPSHUFB = true;
- case X86ISD::PSHUFD:
- case X86ISD::PSHUFHW:
- case X86ISD::PSHUFLW:
- if (Op.getOperand(0).hasOneUse() &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
-
- case X86ISD::UNPCKL:
- case X86ISD::UNPCKH:
- assert(Op.getOperand(0) == Op.getOperand(1) &&
- "We only combine unary shuffles!");
- // We can't check for single use, we have to check that this shuffle is the
- // only user.
- if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
+ // Handle the all undef case early.
+ // TODO - should we handle zero/undef case as well? Widening the mask
+ // will lose information on undef elements possibly reducing future
+ // combine possibilities.
+ if (std::all_of(Mask.begin(), Mask.end(),
+ [](int Idx) { return Idx == SM_SentinelUndef; })) {
+ DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
+ return true;
}
+ HasPSHUFB |= (Op.getOpcode() == X86ISD::PSHUFB);
+
+ // See if we can recurse into Input0 (if it's a target shuffle).
+ if (Input0 && Op->isOnlyUserOf(Input0.getNode()) &&
+ combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1, HasPSHUFB,
+ DAG, DCI, Subtarget))
+ return true;
+
+
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
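The all-undef early-out added above is easy to show in isolation; foldsToUndef is an illustrative name for the std::all_of check:

#include <algorithm>
#include <vector>

enum { SM_SentinelUndef = -1 };

// Once the accumulated mask references no real elements, the whole shuffle
// chain can be replaced with UNDEF without widening the mask any further.
bool foldsToUndef(const std::vector<int> &Mask) {
  return std::all_of(Mask.begin(), Mask.end(),
                     [](int Idx) { return Idx == SM_SentinelUndef; });
}

In the patch this corresponds to the DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType())) call, which replaces the root of the chain directly.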
Modified: llvm/trunk/test/CodeGen/X86/vector-sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-sext.ll?rev=260063&r1=260062&r2=260063&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-sext.ll Sun Feb 7 16:51:06 2016
@@ -143,14 +143,12 @@ define <8 x i32> @sext_16i8_to_8i32(<16
;
; SSSE3-LABEL: sext_16i8_to_8i32:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: psrad $24, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_8i32:
Modified: llvm/trunk/test/CodeGen/X86/vector-zext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-zext.ll?rev=260063&r1=260062&r2=260063&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-zext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-zext.ll Sun Feb 7 16:51:06 2016
@@ -1205,9 +1205,8 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[8],zero,xmm1[10],zero,xmm1[12],zero,xmm1[14],zero
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]