[llvm] r314337 - [X86][SSE] Pull out variable shuffle mask combine logic. NFCI.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 27 13:19:53 PDT 2017
Author: rksimon
Date: Wed Sep 27 13:19:53 2017
New Revision: 314337
URL: http://llvm.org/viewvc/llvm-project?rev=314337&view=rev
Log:
[X86][SSE] Pull out variable shuffle mask combine logic. NFCI.
Hopefully this will make it easier to vary the combine depth threshold per-target.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=314337&r1=314336&r2=314337&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Sep 27 13:19:53 2017
@@ -27779,12 +27779,16 @@ static SDValue combineX86ShuffleChain(Ar
if (Depth < 2)
return SDValue();
+ // Depth threshold above which we can efficiently use variable mask shuffles.
+ // TODO This should probably be target specific.
+ bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
+
bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
- if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+ if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasAVX512() &&
@@ -27805,7 +27809,7 @@ static SDValue combineX86ShuffleChain(Ar
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source.
- if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+ if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
@@ -27833,7 +27837,7 @@ static SDValue combineX86ShuffleChain(Ar
}
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
- if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+ if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
@@ -27859,7 +27863,7 @@ static SDValue combineX86ShuffleChain(Ar
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
- if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
+ if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
@@ -27890,7 +27894,7 @@ static SDValue combineX86ShuffleChain(Ar
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes, use the variable mask to VPERMILPS.
// TODO Combine other mask types at higher depths.
- if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
+ if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
@@ -27910,7 +27914,7 @@ static SDValue combineX86ShuffleChain(Ar
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
- if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
+ if (AllowVariableMask && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
@@ -27952,7 +27956,7 @@ static SDValue combineX86ShuffleChain(Ar
// Intel's manuals suggest only using PSHUFB if doing so replaces 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
- if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+ if (UnaryShuffle && AllowVariableMask &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
@@ -27970,7 +27974,7 @@ static SDValue combineX86ShuffleChain(Ar
continue;
}
M = Ratio * M + i % Ratio;
- assert ((M / 16) == (i / 16) && "Lane crossing detected");
+ assert((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
@@ -27986,8 +27990,7 @@ static SDValue combineX86ShuffleChain(Ar
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
- if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
- Subtarget.hasXOP()) {
+ if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
More information about the llvm-commits mailing list